From 87d60b6e19ee1c6d818e6cd5b7a3c4f56f5471ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 10:18:19 +0200
Subject: [PATCH] reword explanation of encoder_attention_mask

---
 transformers/modeling_bert.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 05ab3395de..be8ec5ba21 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -201,9 +201,9 @@ class BertSelfAttention(nn.Module):
     def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
         mixed_query_layer = self.query(hidden_states)
 
-        # if the attention Module is a encoder-decoder self attention module
-        # they keys & values are given by the encoder; the attention mask
-        # needs to be such that there is no atention on the encoder's padding tokens.
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
         if encoder_hidden_states is not None:
             mixed_key_layer = self.key(encoder_hidden_states)
             mixed_value_layer = self.value(encoder_hidden_states)