From 6ed9882ddb2b6249463c855dcca6860161d91f3e Mon Sep 17 00:00:00 2001 From: Thomas Viehmann Date: Tue, 30 Nov 2021 17:47:33 +0100 Subject: [PATCH] use functional interface for softmax in attention (#14198) * use functional interface instead of instantiating module and immediately calling it * fix torch.nn.functional to nn.functional. Thank you Stas! --- .../movement-pruning/emmental/modeling_bert_masked.py | 2 +- src/transformers/models/albert/modeling_albert.py | 2 +- src/transformers/models/beit/modeling_beit.py | 2 +- src/transformers/models/bert/modeling_bert.py | 2 +- src/transformers/models/canine/modeling_canine.py | 2 +- src/transformers/models/deit/modeling_deit.py | 2 +- src/transformers/models/distilbert/modeling_distilbert.py | 2 +- src/transformers/models/electra/modeling_electra.py | 2 +- src/transformers/models/gpt2/modeling_gpt2.py | 4 ++-- src/transformers/models/gpt_neo/modeling_gpt_neo.py | 2 +- src/transformers/models/gptj/modeling_gptj.py | 2 +- src/transformers/models/ibert/quant_modules.py | 2 +- src/transformers/models/layoutlm/modeling_layoutlm.py | 2 +- src/transformers/models/luke/modeling_luke.py | 2 +- src/transformers/models/lxmert/modeling_lxmert.py | 2 +- .../models/megatron_bert/modeling_megatron_bert.py | 2 +- src/transformers/models/mobilebert/modeling_mobilebert.py | 2 +- src/transformers/models/mpnet/modeling_mpnet.py | 2 +- src/transformers/models/openai/modeling_openai.py | 2 +- src/transformers/models/rembert/modeling_rembert.py | 2 +- src/transformers/models/roberta/modeling_roberta.py | 2 +- src/transformers/models/roformer/modeling_roformer.py | 2 +- src/transformers/models/segformer/modeling_segformer.py | 2 +- src/transformers/models/splinter/modeling_splinter.py | 2 +- src/transformers/models/tapas/modeling_tapas.py | 2 +- src/transformers/models/visual_bert/modeling_visual_bert.py | 2 +- src/transformers/models/vit/modeling_vit.py | 2 +- .../modeling_{{cookiecutter.lowercase_modelname}}.py | 2 +- 28 files changed, 29 insertions(+), 29 deletions(-) diff --git a/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py b/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py index eafb730e95..771d2078d0 100644 --- a/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py +++ b/examples/research_projects/movement-pruning/emmental/modeling_bert_masked.py @@ -152,7 +152,7 @@ class BertSelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py index 79bb553613..95fd9fe235 100755 --- a/src/transformers/models/albert/modeling_albert.py +++ b/src/transformers/models/albert/modeling_albert.py @@ -348,7 +348,7 @@ class AlbertAttention(nn.Module): attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index c6f0d89093..12f0050912 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -244,7 +244,7 @@ class BeitSelfAttention(nn.Module): attention_scores = attention_scores + relative_position_bias # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 496560db59..5af77eec00 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -327,7 +327,7 @@ class BertSelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py index f2ba24ec0d..0081bedcd9 100644 --- a/src/transformers/models/canine/modeling_canine.py +++ b/src/transformers/models/canine/modeling_canine.py @@ -477,7 +477,7 @@ class CanineSelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py index e47e88b849..3f26cbef31 100644 --- a/src/transformers/models/deit/modeling_deit.py +++ b/src/transformers/models/deit/modeling_deit.py @@ -159,7 +159,7 @@ class DeiTSelfAttention(nn.Module): attention_scores = attention_scores / math.sqrt(self.attention_head_size) # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py index a79b452394..e9b57adc81 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py @@ -206,7 +206,7 @@ class MultiHeadSelfAttention(nn.Module): mask = (mask == 0).view(mask_reshp).expand_as(scores) # (bs, n_heads, q_length, k_length) scores = scores.masked_fill(mask, -float("inf")) # (bs, n_heads, q_length, k_length) - weights = nn.Softmax(dim=-1)(scores) # (bs, n_heads, q_length, k_length) + weights = nn.functional.softmax(scores, dim=-1) # (bs, n_heads, q_length, k_length) weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) # Mask heads if we want to diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py index 1bd2f45644..71782da69b 100644 --- a/src/transformers/models/electra/modeling_electra.py +++ b/src/transformers/models/electra/modeling_electra.py @@ -319,7 +319,7 @@ class ElectraSelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index 77ef0386ea..1fbee09990 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -209,7 +209,7 @@ class GPT2Attention(nn.Module): # Apply the attention mask attn_weights = attn_weights + attention_mask - attn_weights = nn.Softmax(dim=-1)(attn_weights) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise attn_weights = attn_weights.type(value.dtype) @@ -260,7 +260,7 @@ class GPT2Attention(nn.Module): # Apply the attention mask attn_weights = attn_weights + attention_mask - attn_weights = nn.Softmax(dim=-1)(attn_weights) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise if attn_weights.dtype != torch.float32: diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index 9785178ce8..7046f75b55 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -199,7 +199,7 @@ class GPTNeoSelfAttention(nn.Module): # Apply the attention mask attn_weights = attn_weights + attention_mask - attn_weights = nn.Softmax(dim=-1)(attn_weights) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) attn_weights = attn_weights.to(value.dtype) attn_weights = self.attn_dropout(attn_weights) diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 603619cc5a..60705fbb22 100755 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -151,7 +151,7 @@ class GPTJAttention(nn.Module): # Apply the attention mask attn_weights = attn_weights + attention_mask - attn_weights = nn.Softmax(dim=-1)(attn_weights) + attn_weights = nn.functional.softmax(attn_weights, dim=-1) attn_weights = attn_weights.to(value.dtype) attn_weights = self.attn_dropout(attn_weights) diff --git a/src/transformers/models/ibert/quant_modules.py b/src/transformers/models/ibert/quant_modules.py index 281bc96df8..386988c06d 100644 --- a/src/transformers/models/ibert/quant_modules.py +++ b/src/transformers/models/ibert/quant_modules.py @@ -409,7 +409,7 @@ class IntSoftmax(nn.Module): def forward(self, x, scaling_factor): if not self.quant_mode: - return nn.Softmax(dim=-1)(x), None + return nn.functional.softmax(x, dim=-1), None x_int = x / scaling_factor diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index 186146e120..251ad624cf 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -235,7 +235,7 @@ class LayoutLMSelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py index 6edd84a3ce..1b2a4f6ffb 100644 --- a/src/transformers/models/luke/modeling_luke.py +++ b/src/transformers/models/luke/modeling_luke.py @@ -399,7 +399,7 @@ class LukeSelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/lxmert/modeling_lxmert.py b/src/transformers/models/lxmert/modeling_lxmert.py index c78e36fddb..1a54353d8b 100644 --- a/src/transformers/models/lxmert/modeling_lxmert.py +++ b/src/transformers/models/lxmert/modeling_lxmert.py @@ -356,7 +356,7 @@ class LxmertAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 12f026f63c..a2546ef51d 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -298,7 +298,7 @@ class MegatronBertSelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py index 28c01d5521..79519f6cf8 100644 --- a/src/transformers/models/mobilebert/modeling_mobilebert.py +++ b/src/transformers/models/mobilebert/modeling_mobilebert.py @@ -264,7 +264,7 @@ class MobileBertSelfAttention(nn.Module): # Apply the attention mask is (precomputed for all layers in BertModel forward() function) attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. attention_probs = self.dropout(attention_probs) diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index 70e2d09a93..c4eadbf439 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -184,7 +184,7 @@ class MPNetSelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) attention_probs = self.dropout(attention_probs) diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py index 782812b7e7..6153a87301 100644 --- a/src/transformers/models/openai/modeling_openai.py +++ b/src/transformers/models/openai/modeling_openai.py @@ -185,7 +185,7 @@ class Attention(nn.Module): # Apply the attention mask w = w + attention_mask - w = nn.Softmax(dim=-1)(w) + w = nn.functional.softmax(w, dim=-1) w = self.attn_dropout(w) # Mask heads if we want to diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py index bc5569e553..55d4c557ac 100755 --- a/src/transformers/models/rembert/modeling_rembert.py +++ b/src/transformers/models/rembert/modeling_rembert.py @@ -290,7 +290,7 @@ class RemBertSelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py index f6fce3eedd..0ed738cd6b 100644 --- a/src/transformers/models/roberta/modeling_roberta.py +++ b/src/transformers/models/roberta/modeling_roberta.py @@ -262,7 +262,7 @@ class RobertaSelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index 14e74a24f8..4f3188111b 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ b/src/transformers/models/roformer/modeling_roformer.py @@ -301,7 +301,7 @@ class RoFormerSelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py index e365febccb..538b64d1a8 100755 --- a/src/transformers/models/segformer/modeling_segformer.py +++ b/src/transformers/models/segformer/modeling_segformer.py @@ -165,7 +165,7 @@ class SegformerEfficientSelfAttention(nn.Module): attention_scores = attention_scores / math.sqrt(self.attention_head_size) # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/splinter/modeling_splinter.py b/src/transformers/models/splinter/modeling_splinter.py index 19dab0457d..c88f0eab0b 100755 --- a/src/transformers/models/splinter/modeling_splinter.py +++ b/src/transformers/models/splinter/modeling_splinter.py @@ -202,7 +202,7 @@ class SplinterSelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index 7ff9081fab..05d6da3b63 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -419,7 +419,7 @@ class TapasSelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py index eabca9ad4c..a1b8cc644a 100755 --- a/src/transformers/models/visual_bert/modeling_visual_bert.py +++ b/src/transformers/models/visual_bert/modeling_visual_bert.py @@ -244,7 +244,7 @@ class VisualBertSelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py index b1bc303124..1a8a0db513 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -193,7 +193,7 @@ class ViTSelfAttention(nn.Module): attention_scores = attention_scores / math.sqrt(self.attention_head_size) # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py index 7d0afd2d9c..87806aafc8 100755 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py @@ -304,7 +304,7 @@ class {{cookiecutter.camelcase_modelname}}SelfAttention(nn.Module): attention_scores = attention_scores + attention_mask # Normalize the attention scores to probabilities. - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs = nn.functional.softmax(attention_scores, dim=-1) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper.