From 2f2aa0c89cab9a77560e6845578f917a61081c67 Mon Sep 17 00:00:00 2001 From: Teven Date: Thu, 6 Aug 2020 17:47:32 +0200 Subject: [PATCH] added `n_inner` argument to gpt2 config (#6296) --- src/transformers/configuration_gpt2.py | 4 ++++ src/transformers/modeling_gpt2.py | 3 ++- src/transformers/modeling_tf_gpt2.py | 3 ++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py index 814846cbde..5dd80d1987 100644 --- a/src/transformers/configuration_gpt2.py +++ b/src/transformers/configuration_gpt2.py @@ -59,6 +59,8 @@ class GPT2Config(PretrainedConfig): Number of hidden layers in the Transformer encoder. n_head (:obj:`int`, optional, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. + n_inner (:obj:`int`, optional, defaults to None): + Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd activation_function (:obj:`str`, optional, defaults to 'gelu'): Activation function selected in the list ["relu", "swish", "gelu", "tanh", "gelu_new"]. resid_pdrop (:obj:`float`, optional, defaults to 0.1): @@ -122,6 +124,7 @@ class GPT2Config(PretrainedConfig): n_embd=768, n_layer=12, n_head=12, + n_inner=None, activation_function="gelu_new", resid_pdrop=0.1, embd_pdrop=0.1, @@ -145,6 +148,7 @@ class GPT2Config(PretrainedConfig): self.n_embd = n_embd self.n_layer = n_layer self.n_head = n_head + self.n_inner = n_inner self.activation_function = activation_function self.resid_pdrop = resid_pdrop self.embd_pdrop = embd_pdrop diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py index 3a8d104d89..ea23a819d5 100644 --- a/src/transformers/modeling_gpt2.py +++ b/src/transformers/modeling_gpt2.py @@ -240,10 +240,11 @@ class Block(nn.Module): def __init__(self, n_ctx, config, scale=False): super().__init__() nx = config.n_embd + inner_dim = config.n_inner if config.n_inner is not None else 4 * nx self.ln_1 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) self.attn = Attention(nx, n_ctx, config, scale) self.ln_2 = nn.LayerNorm(nx, eps=config.layer_norm_epsilon) - self.mlp = MLP(4 * nx, config) + self.mlp = MLP(inner_dim, config) def forward( self, x, layer_past=None, attention_mask=None, head_mask=None, use_cache=False, output_attentions=False, diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py index 5221ef46ce..4952e286fb 100644 --- a/src/transformers/modeling_tf_gpt2.py +++ b/src/transformers/modeling_tf_gpt2.py @@ -194,10 +194,11 @@ class TFBlock(tf.keras.layers.Layer): def __init__(self, n_ctx, config, scale=False, **kwargs): super().__init__(**kwargs) nx = config.n_embd + inner_dim = config.n_inner if config.n_inner is not None else 4 * nx self.ln_1 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_1") self.attn = TFAttention(nx, n_ctx, config, scale, name="attn") self.ln_2 = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="ln_2") - self.mlp = TFMLP(4 * nx, config, name="mlp") + self.mlp = TFMLP(inner_dim, config, name="mlp") def call(self, x, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False): a = self.ln_1(x)