From 87d60b6e19ee1c6d818e6cd5b7a3c4f56f5471ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Thu, 17 Oct 2019 10:18:19 +0200 Subject: [PATCH] reword explanation of encoder_attention_mask --- transformers/modeling_bert.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index 05ab3395de..be8ec5ba21 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -201,9 +201,9 @@ class BertSelfAttention(nn.Module): def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None): mixed_query_layer = self.query(hidden_states) - # if the attention Module is a encoder-decoder self attention module - # they keys & values are given by the encoder; the attention mask - # needs to be such that there is no atention on the encoder's padding tokens. + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. if encoder_hidden_states is not None: mixed_key_layer = self.key(encoder_hidden_states) mixed_value_layer = self.value(encoder_hidden_states)