From 2ee9f9b69e67426aaed690f652f9cdd8b524b99d Mon Sep 17 00:00:00 2001
From: Funtowicz Morgan <mfuntowicz@users.noreply.github.com>
Date: Thu, 28 Jan 2021 12:11:52 +0100
Subject: [PATCH] Fix computation of attention_probs when head_mask is
 provided. (#9853)

* Fix computation of attention_probs when head_mask is provided.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Apply changes to the template

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
---
 src/transformers/models/bert/modeling_tf_bert.py                | 2 +-
 src/transformers/models/electra/modeling_tf_electra.py          | 2 +-
 src/transformers/models/roberta/modeling_tf_roberta.py          | 2 +-
 .../modeling_tf_{{cookiecutter.lowercase_modelname}}.py         | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py
index d7bf7715a4..da353e8ca4 100644
--- a/src/transformers/models/bert/modeling_tf_bert.py
+++ b/src/transformers/models/bert/modeling_tf_bert.py
@@ -370,7 +370,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
 
         # Mask heads if we want to
         if head_mask is not None:
-            attention_scores = tf.multiply(attention_scores, head_mask)
+            attention_probs = tf.multiply(attention_probs, head_mask)
 
         attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer)
         outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py
index 476b2b6172..a968dbf71f 100644
--- a/src/transformers/models/electra/modeling_tf_electra.py
+++ b/src/transformers/models/electra/modeling_tf_electra.py
@@ -253,7 +253,7 @@ class TFElectraSelfAttention(tf.keras.layers.Layer):
 
         # Mask heads if we want to
         if head_mask is not None:
-            attention_scores = tf.multiply(attention_scores, head_mask)
+            attention_probs = tf.multiply(attention_probs, head_mask)
 
         attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer)
         outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
diff --git a/src/transformers/models/roberta/modeling_tf_roberta.py b/src/transformers/models/roberta/modeling_tf_roberta.py
index e34c984a37..c7460ff776 100644
--- a/src/transformers/models/roberta/modeling_tf_roberta.py
+++ b/src/transformers/models/roberta/modeling_tf_roberta.py
@@ -377,7 +377,7 @@ class TFRobertaSelfAttention(tf.keras.layers.Layer):
 
         # Mask heads if we want to
         if head_mask is not None:
-            attention_scores = tf.multiply(attention_scores, head_mask)
+            attention_probs = tf.multiply(attention_probs, head_mask)
 
         attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer)
         outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
index 37aaf0358b..0d5a61206b 100644
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
@@ -317,7 +317,7 @@ class TF{{cookiecutter.camelcase_modelname}}SelfAttention(tf.keras.layers.Layer)
 
         # Mask heads if we want to
         if head_mask is not None:
-            attention_scores = tf.multiply(attention_scores, head_mask)
+            attention_probs = tf.multiply(attention_probs, head_mask)
 
         attention_output = tf.einsum("acbe,aecd->abcd", attention_probs, value_layer)
         outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)