From 80607874c1f82e137ceb2cff3397c6a91d6aa963 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 8 Feb 2019 21:49:05 +0100 Subject: [PATCH] fix layer norm epsilon in OpenAI GPT --- pytorch_pretrained_bert/modeling_openai.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 7100905a3a..e6f3fc4efe 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -141,6 +141,7 @@ class OpenAIGPTConfig(object): resid_pdrop=0.1, embd_pdrop=0.1, attn_pdrop=0.1, + layer_norm_epsilon=1e-5, initializer_range=0.02, ): """Constructs OpenAIGPTConfig. @@ -161,6 +162,7 @@ class OpenAIGPTConfig(object): attn_pdrop: The dropout ratio for the attention probabilities. embd_pdrop: The dropout ratio for the embeddings. + layer_norm_epsilon: epsilon to use in the layer norm layers initializer_range: The sttdev of the truncated_normal_initializer for initializing all weight matrices. """ @@ -182,6 +184,7 @@ class OpenAIGPTConfig(object): self.resid_pdrop = resid_pdrop self.embd_pdrop = embd_pdrop self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range else: raise ValueError( @@ -318,9 +321,9 @@ class Block(nn.Module): super(Block, self).__init__() nx = config.n_embd self.attn = Attention(nx, n_ctx, config, scale) - self.ln_1 = LayerNorm(nx) + self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon) self.mlp = MLP(4 * nx, config) - self.ln_2 = LayerNorm(nx) + self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon) def forward(self, x): a = self.attn(x)