From 5a8a4eb18746d158faa3331748a28c4ccd88d063 Mon Sep 17 00:00:00 2001 From: Julien Plu Date: Mon, 21 Dec 2020 13:10:15 +0100 Subject: [PATCH] Improve BERT-like models performance with better self attention (#9124) * Improve BERT-like models attention layers * Apply style * Put back error raising instead of assert * Update template * Fix copies * Apply raising valueerror in MPNet * Restore the copy check for the Intermediate layer in Longformer * Update longformer --- setup.py | 4 +- src/transformers/dependency_versions_table.py | 4 +- .../models/bert/modeling_tf_bert.py | 127 +++++++++-------- .../models/electra/modeling_tf_electra.py | 133 ++++++++++-------- .../longformer/modeling_tf_longformer.py | 31 ++-- .../models/mpnet/modeling_tf_mpnet.py | 90 ++++++------ .../models/roberta/modeling_tf_roberta.py | 133 ++++++++++-------- ...tf_{{cookiecutter.lowercase_modelname}}.py | 97 +++++++------ 8 files changed, 348 insertions(+), 271 deletions(-) diff --git a/setup.py b/setup.py index 2b025c221a..860cb8e4a1 100644 --- a/setup.py +++ b/setup.py @@ -127,8 +127,8 @@ _deps = [ "sphinx-rtd-theme==0.4.3", # sphinx-rtd-theme==0.5.0 introduced big changes in the style. 
"sphinx==3.2.1", "starlette", - "tensorflow-cpu>=2.0", - "tensorflow>=2.0", + "tensorflow-cpu>=2.3", + "tensorflow>=2.3", "timeout-decorator", "tokenizers==0.9.4", "torch>=1.0", diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index c6901c198f..b07c53058f 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -40,8 +40,8 @@ deps = { "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3", "sphinx": "sphinx==3.2.1", "starlette": "starlette", - "tensorflow-cpu": "tensorflow-cpu>=2.0", - "tensorflow": "tensorflow>=2.0", + "tensorflow-cpu": "tensorflow-cpu>=2.3", + "tensorflow": "tensorflow>=2.3", "timeout-decorator": "timeout-decorator", "tokenizers": "tokenizers==0.9.4", "torch": "torch>=1.0", diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py index 987b1d9dc0..485639237f 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -246,52 +246,52 @@ class TFBertSelfAttention(tf.keras.layers.Layer): if config.hidden_size % config.num_attention_heads != 0: raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads) + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" ) self.num_attention_heads = config.num_attention_heads - assert config.hidden_size % config.num_attention_heads == 0 self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = tf.keras.layers.Dense( - self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + self.query = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, 
config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="query", ) - self.key = tf.keras.layers.Dense( - self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + self.key = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="key", ) - self.value = tf.keras.layers.Dense( - self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + self.value = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="value", ) - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - def transpose_for_scores(self, x, batch_size): - x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) + def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False): + query_layer = self.query(inputs=hidden_states) + key_layer = self.key(inputs=hidden_states) + value_layer = self.value(inputs=hidden_states) - return tf.transpose(x, perm=[0, 2, 1, 3]) - - def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): - batch_size = shape_list(hidden_states)[0] - mixed_query_layer = self.query(hidden_states) - mixed_key_layer = self.key(hidden_states) - mixed_value_layer = self.value(hidden_states) - query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) - key_layer = 
self.transpose_for_scores(mixed_key_layer, batch_size) - value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = tf.matmul( - query_layer, key_layer, transpose_b=True - ) # (batch size, num_heads, seq_len_q, seq_len_k) - dk = tf.cast(shape_list(key_layer)[-1], attention_scores.dtype) # scale attention_scores - attention_scores = attention_scores / tf.math.sqrt(dk) + # Take the dot product between "query" and "key" to get the raw + # attention scores. + dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype) + query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk)) + attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(attention_scores, axis=-1) + attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. 
@@ -299,14 +299,10 @@ class TFBertSelfAttention(tf.keras.layers.Layer): # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask - context_layer = tf.matmul(attention_probs, value_layer) - context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) - context_layer = tf.reshape( - context_layer, (batch_size, -1, self.all_head_size) - ) # (batch_size, seq_len_q, all_head_size) - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) + outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) return outputs @@ -315,16 +311,29 @@ class TFBertSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dense = tf.keras.layers.experimental.EinsumDense( + equation="abcd,cde->abe", + output_shape=(None, self.all_head_size), + bias_axes="e", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) def call(self, hidden_states, input_tensor, 
training=False): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = self.LayerNorm(hidden_states + input_tensor) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states @@ -353,18 +362,22 @@ class TFBertIntermediate(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( - config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + self.dense = tf.keras.layers.experimental.EinsumDense( + equation="abc,cd->abd", + output_shape=(None, config.intermediate_size), + bias_axes="d", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(config.hidden_act) + self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) else: self.intermediate_act_fn = config.hidden_act def call(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(inputs=hidden_states) return hidden_states @@ -373,16 +386,20 @@ class TFBertOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + self.dense = tf.keras.layers.experimental.EinsumDense( + equation="abc,cd->abd", + bias_axes="d", + output_shape=(None, config.hidden_size), + kernel_initializer=get_initializer(config.initializer_range), + name="dense", ) self.LayerNorm = 
tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) def call(self, hidden_states, input_tensor, training=False): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = self.LayerNorm(hidden_states + input_tensor) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py index 3a39b03762..709b5f26d7 100644 --- a/src/transformers/models/electra/modeling_tf_electra.py +++ b/src/transformers/models/electra/modeling_tf_electra.py @@ -69,59 +69,59 @@ TF_ELECTRA_PRETRAINED_MODEL_ARCHIVE_LIST = [ ] -# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Electra class TFElectraSelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads) + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" ) self.num_attention_heads = config.num_attention_heads - assert config.hidden_size % config.num_attention_heads == 0 self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = tf.keras.layers.Dense( - self.all_head_size, 
kernel_initializer=get_initializer(config.initializer_range), name="query" + self.query = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="query", ) - self.key = tf.keras.layers.Dense( - self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + self.key = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="key", ) - self.value = tf.keras.layers.Dense( - self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + self.value = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="value", ) - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - def transpose_for_scores(self, x, batch_size): - x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) + def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False): + query_layer = self.query(inputs=hidden_states) + key_layer = self.key(inputs=hidden_states) + value_layer = self.value(inputs=hidden_states) - return tf.transpose(x, perm=[0, 2, 1, 3]) - - def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): - batch_size = shape_list(hidden_states)[0] - mixed_query_layer = self.query(hidden_states) - mixed_key_layer = 
self.key(hidden_states) - mixed_value_layer = self.value(hidden_states) - query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) - key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) - value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = tf.matmul( - query_layer, key_layer, transpose_b=True - ) # (batch size, num_heads, seq_len_q, seq_len_k) - dk = tf.cast(shape_list(key_layer)[-1], attention_scores.dtype) # scale attention_scores - attention_scores = attention_scores / tf.math.sqrt(dk) + # Take the dot product between "query" and "key" to get the raw + # attention scores. + dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype) + query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk)) + attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) + # Apply the attention mask is (precomputed for all layers in TFElectraModel call() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(attention_scores, axis=-1) + attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. 
@@ -129,33 +129,42 @@ class TFElectraSelfAttention(tf.keras.layers.Layer): # Mask heads if we want to if head_mask is not None: attention_probs = attention_probs * head_mask - context_layer = tf.matmul(attention_probs, value_layer) - context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) - context_layer = tf.reshape( - context_layer, (batch_size, -1, self.all_head_size) - ) # (batch_size, seq_len_q, all_head_size) - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) + outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Electra class TFElectraSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dense = tf.keras.layers.experimental.EinsumDense( + equation="abcd,cde->abe", + output_shape=(None, self.all_head_size), + bias_axes="e", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = 
tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) def call(self, hidden_states, input_tensor, training=False): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = self.LayerNorm(hidden_states + input_tensor) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states @@ -186,18 +195,22 @@ class TFElectraIntermediate(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( - config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + self.dense = tf.keras.layers.experimental.EinsumDense( + equation="abc,cd->abd", + output_shape=(None, config.intermediate_size), + bias_axes="d", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(config.hidden_act) + self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) else: self.intermediate_act_fn = config.hidden_act def call(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(inputs=hidden_states) return hidden_states @@ -207,16 +220,20 @@ class TFElectraOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + self.dense = tf.keras.layers.experimental.EinsumDense( + equation="abc,cd->abd", + bias_axes="d", + 
output_shape=(None, config.hidden_size), + kernel_initializer=get_initializer(config.initializer_range), + name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) def call(self, hidden_states, input_tensor, training=False): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = self.LayerNorm(hidden_states + input_tensor) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py index db30435be5..8f0d4fb91c 100644 --- a/src/transformers/models/longformer/modeling_tf_longformer.py +++ b/src/transformers/models/longformer/modeling_tf_longformer.py @@ -618,18 +618,22 @@ class TFLongformerIntermediate(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( - config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + self.dense = tf.keras.layers.experimental.EinsumDense( + equation="abc,cd->abd", + output_shape=(None, config.intermediate_size), + bias_axes="d", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(config.hidden_act) + self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) else: self.intermediate_act_fn = config.hidden_act def call(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = 
self.intermediate_act_fn(hidden_states) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(inputs=hidden_states) return hidden_states @@ -639,16 +643,20 @@ class TFLongformerOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + self.dense = tf.keras.layers.experimental.EinsumDense( + equation="abc,cd->abd", + bias_axes="d", + output_shape=(None, config.hidden_size), + kernel_initializer=get_initializer(config.initializer_range), + name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) def call(self, hidden_states, input_tensor, training=False): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = self.LayerNorm(hidden_states + input_tensor) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states @@ -674,7 +682,6 @@ class TFLongformerPooler(tf.keras.layers.Layer): return pooled_output -# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput class TFLongformerSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) diff --git a/src/transformers/models/mpnet/modeling_tf_mpnet.py b/src/transformers/models/mpnet/modeling_tf_mpnet.py index 02f462572d..23d3d45d6e 100644 --- a/src/transformers/models/mpnet/modeling_tf_mpnet.py +++ b/src/transformers/models/mpnet/modeling_tf_mpnet.py @@ -239,54 +239,58 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer): if 
config.hidden_size % config.num_attention_heads != 0: raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads) + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" ) self.num_attention_heads = config.num_attention_heads - assert config.hidden_size % config.num_attention_heads == 0 self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.all_head_size = self.num_attention_heads * self.attention_head_size - - self.q = tf.keras.layers.Dense( - self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="q" + self.q = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="q", ) - self.k = tf.keras.layers.Dense( - self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="k" + self.k = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="k", ) - self.v = tf.keras.layers.Dense( - self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="v" + self.v = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="v", ) - self.o = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="o" + self.o = tf.keras.layers.experimental.EinsumDense( + 
equation="abcd,cde->abe", + output_shape=(None, self.all_head_size), + bias_axes="e", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="o", ) self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) - def transpose_for_scores(self, x, batch_size): - x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) - - return tf.transpose(x, perm=[0, 2, 1, 3]) - def call(self, hidden_states, attention_mask, head_mask, output_attentions, position_bias=None, training=False): - batch_size = shape_list(hidden_states)[0] - q = self.q(hidden_states) k = self.k(hidden_states) v = self.v(hidden_states) - q = self.transpose_for_scores(q, batch_size) - k = self.transpose_for_scores(k, batch_size) - v = self.transpose_for_scores(v, batch_size) - - attention_scores = tf.matmul(q, k, transpose_b=True) - dk = tf.cast(shape_list(k)[-1], attention_scores.dtype) - attention_scores = attention_scores / tf.math.sqrt(dk) + dk = tf.cast(x=self.attention_head_size, dtype=q.dtype) + q = tf.multiply(x=q, y=tf.math.rsqrt(x=dk)) + attention_scores = tf.einsum("aecd,abcd->acbe", k, q) # Apply relative position embedding (precomputed in MPNetEncoder) if provided. 
if position_bias is not None: attention_scores += position_bias if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFMPNetModel call() function) attention_scores = attention_scores + attention_mask attention_probs = tf.nn.softmax(attention_scores, axis=-1) @@ -296,9 +300,7 @@ class TFMPNetSelfAttention(tf.keras.layers.Layer): if head_mask is not None: attention_probs = attention_probs * head_mask - c = tf.matmul(attention_probs, v) - c = tf.transpose(c, perm=[0, 2, 1, 3]) - c = tf.reshape(c, (batch_size, -1, self.all_head_size)) + c = tf.einsum("acbe,aecd->abcd", attention_probs, v) o = self.o(c) outputs = (o, attention_probs) if output_attentions else (o,) @@ -330,18 +332,22 @@ class TFMPNetIntermediate(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( - config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + self.dense = tf.keras.layers.experimental.EinsumDense( + equation="abc,cd->abd", + output_shape=(None, config.intermediate_size), + bias_axes="d", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(config.hidden_act) + self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) else: self.intermediate_act_fn = config.hidden_act def call(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(inputs=hidden_states) return hidden_states @@ -351,16 +357,20 @@ class TFMPNetOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), 
name="dense" + self.dense = tf.keras.layers.experimental.EinsumDense( + equation="abc,cd->abd", + bias_axes="d", + output_shape=(None, config.hidden_size), + kernel_initializer=get_initializer(config.initializer_range), + name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) def call(self, hidden_states, input_tensor, training=False): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = self.LayerNorm(hidden_states + input_tensor) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py index ae5f3dd223..a7c56b1746 100644 --- a/src/transformers/models/roberta/modeling_tf_roberta.py +++ b/src/transformers/models/roberta/modeling_tf_roberta.py @@ -243,59 +243,59 @@ class TFRobertaPooler(tf.keras.layers.Layer): return pooled_output -# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Roberta class TFRobertaSelfAttention(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) if config.hidden_size % config.num_attention_heads != 0: raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads) + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" ) self.num_attention_heads = config.num_attention_heads - 
assert config.hidden_size % config.num_attention_heads == 0 self.attention_head_size = int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = tf.keras.layers.Dense( - self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + self.query = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="query", ) - self.key = tf.keras.layers.Dense( - self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + self.key = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="key", ) - self.value = tf.keras.layers.Dense( - self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + self.value = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="value", ) - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - def transpose_for_scores(self, x, batch_size): - x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) + def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False): + query_layer = self.query(inputs=hidden_states) + key_layer = self.key(inputs=hidden_states) + value_layer = 
self.value(inputs=hidden_states) - return tf.transpose(x, perm=[0, 2, 1, 3]) - - def call(self, hidden_states, attention_mask, head_mask, output_attentions, training=False): - batch_size = shape_list(hidden_states)[0] - mixed_query_layer = self.query(hidden_states) - mixed_key_layer = self.key(hidden_states) - mixed_value_layer = self.value(hidden_states) - query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) - key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) - value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = tf.matmul( - query_layer, key_layer, transpose_b=True - ) # (batch size, num_heads, seq_len_q, seq_len_k) - dk = tf.cast(shape_list(key_layer)[-1], attention_scores.dtype) # scale attention_scores - attention_scores = attention_scores / tf.math.sqrt(dk) + # Take the dot product between "query" and "key" to get the raw + # attention scores. + dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype) + query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk)) + attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) if attention_mask is not None: - # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) + # Apply the attention mask is (precomputed for all layers in TFRobertaModel call() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(attention_scores, axis=-1) + attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. 
@@ -303,33 +303,42 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer): # Mask heads if we want to if head_mask is not None: - attention_probs = attention_probs * head_mask + attention_scores = attention_scores * head_mask - context_layer = tf.matmul(attention_probs, value_layer) - context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) - context_layer = tf.reshape( - context_layer, (batch_size, -1, self.all_head_size) - ) # (batch_size, seq_len_q, all_head_size) - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) + outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) return outputs -# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Roberta class TFRobertaSelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dense = tf.keras.layers.experimental.EinsumDense( + equation="abcd,cde->abe", + output_shape=(None, self.all_head_size), + bias_axes="e", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = 
tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) def call(self, hidden_states, input_tensor, training=False): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = self.LayerNorm(hidden_states + input_tensor) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states @@ -360,18 +369,22 @@ class TFRobertaIntermediate(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( - config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + self.dense = tf.keras.layers.experimental.EinsumDense( + equation="abc,cd->abd", + output_shape=(None, config.intermediate_size), + bias_axes="d", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="dense", ) if isinstance(config.hidden_act, str): - self.intermediate_act_fn = get_tf_activation(config.hidden_act) + self.intermediate_act_fn = get_tf_activation(activation_string=config.hidden_act) else: self.intermediate_act_fn = config.hidden_act def call(self, hidden_states): - hidden_states = self.dense(hidden_states) - hidden_states = self.intermediate_act_fn(hidden_states) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(inputs=hidden_states) return hidden_states @@ -381,16 +394,20 @@ class TFRobertaOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + self.dense = tf.keras.layers.experimental.EinsumDense( + equation="abc,cd->abd", + bias_axes="d", + 
output_shape=(None, config.hidden_size), + kernel_initializer=get_initializer(config.initializer_range), + name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) def call(self, hidden_states, input_tensor, training=False): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = self.LayerNorm(hidden_states + input_tensor) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py index 109b9f310b..5c8ffbfc41 100644 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py @@ -191,52 +191,52 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) if config.hidden_size % config.num_attention_heads != 0: raise ValueError( - "The hidden size (%d) is not a multiple of the number of attention " - "heads (%d)" % (config.hidden_size, config.num_attention_heads) + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" ) self.num_attention_heads = config.num_attention_heads - assert config.hidden_size % config.num_attention_heads == 0 self.attention_head_size = 
int(config.hidden_size / config.num_attention_heads) - self.all_head_size = self.num_attention_heads * self.attention_head_size - self.query = tf.keras.layers.Dense( - self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + self.query = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="query", ) - self.key = tf.keras.layers.Dense( - self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + self.key = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="key", ) - self.value = tf.keras.layers.Dense( - self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + self.value = tf.keras.layers.experimental.EinsumDense( + equation="abc,cde->abde", + output_shape=(None, config.num_attention_heads, self.attention_head_size), + bias_axes="de", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="value", ) - self.dropout = tf.keras.layers.Dropout(config.attention_probs_dropout_prob) + self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) - def transpose_for_scores(self, x, batch_size): - x = tf.reshape(x, (batch_size, -1, self.num_attention_heads, self.attention_head_size)) + def call(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False, training=False): + query_layer = self.query(inputs=hidden_states) + key_layer = self.key(inputs=hidden_states) + value_layer = self.value(inputs=hidden_states) - return tf.transpose(x, perm=[0, 2, 1, 3]) - - def call(self, hidden_states, 
attention_mask, head_mask, output_attentions, training=False): - batch_size = shape_list(hidden_states)[0] - mixed_query_layer = self.query(hidden_states) - mixed_key_layer = self.key(hidden_states) - mixed_value_layer = self.value(hidden_states) - query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) - key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) - value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) - - # Take the dot product between "query" and "key" to get the raw attention scores. - attention_scores = tf.matmul( - query_layer, key_layer, transpose_b=True - ) # (batch size, num_heads, seq_len_q, seq_len_k) - dk = tf.cast(shape_list(key_layer)[-1], attention_scores.dtype) # scale attention_scores - attention_scores = attention_scores / tf.math.sqrt(dk) + # Take the dot product between "query" and "key" to get the raw + # attention scores. + dk = tf.cast(x=self.attention_head_size, dtype=query_layer.dtype) + query_layer = tf.multiply(x=query_layer, y=tf.math.rsqrt(x=dk)) + attention_scores = tf.einsum("aecd,abcd->acbe", key_layer, query_layer) if attention_mask is not None: # Apply the attention mask is (precomputed for all layers in TF{{cookiecutter.camelcase_modelname}}Model call() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = tf.nn.softmax(attention_scores, axis=-1) + attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. 
@@ -244,14 +244,10 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer) # Mask heads if we want to if head_mask is not None: - attention_probs = attention_probs * head_mask + attention_scores = attention_scores * head_mask - context_layer = tf.matmul(attention_probs, value_layer) - context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) - context_layer = tf.reshape( - context_layer, (batch_size, -1, self.all_head_size) - ) # (batch_size, seq_len_q, all_head_size) - outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer) + outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) return outputs @@ -261,16 +257,29 @@ class TF{{cookiecutter.camelcase_modelname}}SelfOutput(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super().__init__(**kwargs) - self.dense = tf.keras.layers.Dense( - config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dense = tf.keras.layers.experimental.EinsumDense( + equation="abcd,cde->abe", + output_shape=(None, self.all_head_size), + bias_axes="e", + kernel_initializer=get_initializer(initializer_range=config.initializer_range), + name="dense", ) self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.dropout = 
tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) def call(self, hidden_states, input_tensor, training=False): - hidden_states = self.dense(hidden_states) - hidden_states = self.dropout(hidden_states, training=training) - hidden_states = self.LayerNorm(hidden_states + input_tensor) + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) return hidden_states