From 5680a1106302b1ebeb960de0700d6379c0aeef5c Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 30 Oct 2019 20:42:49 +0000 Subject: [PATCH] Activation function managed from the config file --- transformers/configuration_albert.py | 2 +- transformers/modeling_albert.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/transformers/configuration_albert.py b/transformers/configuration_albert.py index c86c9565cb..15437dbbea 100644 --- a/transformers/configuration_albert.py +++ b/transformers/configuration_albert.py @@ -16,7 +16,7 @@ class AlbertConfig(PretrainedConfig): intermediate_size=16384, inner_group_num=1, down_scale_factor=1, - hidden_act="gelu", + hidden_act="gelu_new", hidden_dropout_prob=0, attention_probs_dropout_prob=0, max_position_embeddings=512, diff --git a/transformers/modeling_albert.py b/transformers/modeling_albert.py index 371a2e535c..7e9f7f1c46 100644 --- a/transformers/modeling_albert.py +++ b/transformers/modeling_albert.py @@ -6,7 +6,7 @@ import torch import torch.nn as nn from torch.nn import CrossEntropyLoss from transformers.configuration_albert import AlbertConfig -from transformers.modeling_bert import BertEmbeddings, BertModel, BertSelfAttention, prune_linear_layer, gelu_new +from transformers.modeling_bert import BertEmbeddings, BertModel, BertSelfAttention, prune_linear_layer, ACT2FN from transformers.modeling_utils import PreTrainedModel from .file_utils import add_start_docstrings @@ -190,11 +190,12 @@ class AlbertLayer(nn.Module): self.attention = AlbertAttention(config) self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size) + self.activation = ACT2FN[config.hidden_act] def forward(self, hidden_states, attention_mask=None, head_mask=None): attention_output = self.attention(hidden_states, attention_mask) ffn_output = self.ffn(attention_output) - ffn_output = gelu_new(ffn_output) + ffn_output = self.activation(ffn_output) ffn_output = self.ffn_output(ffn_output) hidden_states = self.full_layer_layer_norm(ffn_output + attention_output) @@ -392,6 +393,7 @@ class AlbertForMaskedLM(PreTrainedModel): self.bias = nn.Parameter(torch.zeros(config.vocab_size)) self.dense = nn.Linear(config.hidden_size, config.embedding_size) self.word_embeddings = nn.Linear(config.embedding_size, config.vocab_size) + self.activation = ACT2FN[config.hidden_act] def tie_weights(self): """ Make sure we are sharing the input and output embeddings. @@ -405,7 +407,7 @@ class AlbertForMaskedLM(PreTrainedModel): outputs = self.bert(input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None) sequence_outputs = outputs[0] hidden_states = self.dense(sequence_outputs) - hidden_states = gelu_new(hidden_states) + hidden_states = self.activation(hidden_states) hidden_states = self.LayerNorm(hidden_states) prediction_scores = self.word_embeddings(hidden_states)