From 9e666aaa297a84f8276cd891cd1a151e5266349e Mon Sep 17 00:00:00 2001
From: Abhi Sharma <18308855+SudoSharma@users.noreply.github.com>
Date: Tue, 16 Apr 2019 11:42:34 -0700
Subject: [PATCH] Fix gradient overflow issue during attention mask

This fix is in reference to issue #382. GPT2 can now be trained in mixed precision, which I've confirmed with testing. I also tested unconditional generation on multiple seeds before and after changing 1e10 to 1e4 and there was no difference. Please let me know if there is anything else I can do to make this pull request better. Thanks for all your work!
---
 pytorch_pretrained_bert/modeling_gpt2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 7cf1e6b59d..063c525d98 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -218,7 +218,7 @@ class Attention(nn.Module):
             w = w / math.sqrt(v.size(-1))
         nd, ns = w.size(-2), w.size(-1)
         b = self.bias[:, :, ns-nd:ns, :ns]
-        w = w * b - 1e10 * (1 - b)
+        w = w * b - 1e4 * (1 - b)
 
         w = nn.Softmax(dim=-1)(w)
         return torch.matmul(w, v)