use functional interface for softmax in attention (#14198)

* Use the functional interface instead of instantiating a module and immediately calling it.
* Fix torch.nn.functional to nn.functional. Thank you Stas!
2021-11-30 17:47:33 +01:00
parent 4176bc161c
commit 6ed9882ddb
28 changed files with 29 additions and 29 deletions
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
@@ -304,7 +304,7 @@ class {{cookiecutter.camelcase_modelname}}SelfAttention(nn.Module):
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.