Merge pull request #495 from SudoSharma/patch-2

Fix gradient overflow when applying the attention mask (masking constant lowered from 1e10 to 1e4)
2019-04-17 11:10:36 +02:00
parent 46078e1b46 9e666aaa29
commit 2e153930cf
1 changed file with 1 addition and 1 deletion
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -218,7 +218,7 @@ class Attention(nn.Module):
            w = w / math.sqrt(v.size(-1))
        nd, ns = w.size(-2), w.size(-1)
        b = self.bias[:, :, ns-nd:ns, :ns]
-        w = w * b - 1e10 * (1 - b)
+        w = w * b - 1e4 * (1 - b)

        w = nn.Softmax(dim=-1)(w)
        return torch.matmul(w, v)