From ef74b0f07a190f19c69abc0732ea955e8dd7330f Mon Sep 17 00:00:00 2001 From: Sam Shleifer Date: Thu, 13 Feb 2020 08:28:33 -0500 Subject: [PATCH] =?UTF-8?q?get=5Factivation('relu')=20provides=20a=20simpl?= =?UTF-8?q?e=20mapping=20from=20strings=20i=E2=80=A6=20(#2807)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * activations.py contains a mapping from string to activation function * resolves some `gelu` vs `gelu_new` ambiguity --- src/transformers/activations.py | 48 +++++++++++++++++++++++++ src/transformers/modeling_bert.py | 21 +---------- src/transformers/modeling_distilbert.py | 3 +- src/transformers/modeling_gpt2.py | 7 ++-- src/transformers/modeling_openai.py | 11 ++---- src/transformers/modeling_utils.py | 15 ++++---- src/transformers/modeling_xlm.py | 12 +------ src/transformers/modeling_xlnet.py | 17 ++------- tests/test_activations.py | 28 +++++++++++++++ 9 files changed, 94 insertions(+), 68 deletions(-) create mode 100644 src/transformers/activations.py create mode 100644 tests/test_activations.py diff --git a/src/transformers/activations.py b/src/transformers/activations.py new file mode 100644 index 0000000000..5d7d3cdc59 --- /dev/null +++ b/src/transformers/activations.py @@ -0,0 +1,48 @@ +import math + +import torch +import torch.nn.functional as F + + +def swish(x): + return x * torch.sigmoid(x) + + +def _gelu_python(x): + """ Original Implementation of the gelu activation function in Google Bert repo when initially created. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + This is now written in C in torch.nn.functional + Also see https://arxiv.org/abs/1606.08415 + """ + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + + +gelu = getattr(F, "gelu", _gelu_python) + + +def gelu_new(x): + """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). + Also see https://arxiv.org/abs/1606.08415 + """ + return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + + +ACT2FN = { + "relu": F.relu, + "swish": swish, + "gelu": gelu, + "tanh": F.tanh, + "gelu_new": gelu_new, +} + + +def get_activation(activation_string): + if activation_string in ACT2FN: + return ACT2FN[activation_string] + else: + raise KeyError( + "function {} not found in ACT2FN mapping {} or torch.nn.functional".format( + activation_string, list(ACT2FN.keys()) + ) + ) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 6cfbe3d00a..feb3529057 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -24,6 +24,7 @@ import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss +from .activations import gelu, gelu_new, swish from .configuration_bert import BertConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, prune_linear_layer @@ -129,26 +130,6 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path): return model -def gelu(x): - """ Original Implementation of the gelu activation function in Google Bert repo when initially created. - For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): - 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - Also see https://arxiv.org/abs/1606.08415 - """ - return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) - - -def gelu_new(x): - """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). - Also see https://arxiv.org/abs/1606.08415 - """ - return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - - -def swish(x): - return x * torch.sigmoid(x) - - def mish(x): return x * torch.tanh(nn.functional.softplus(x)) diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py index 6634aacaff..b155efebef 100644 --- a/src/transformers/modeling_distilbert.py +++ b/src/transformers/modeling_distilbert.py @@ -27,6 +27,7 @@ import torch import torch.nn as nn from torch.nn import CrossEntropyLoss +from .activations import gelu from .configuration_distilbert import DistilBertConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, prune_linear_layer @@ -47,8 +48,6 @@ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = { # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE # -def gelu(x): - return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0))) def create_sinusoidal_embeddings(n_pos, dim, out): diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py index 2426ef4352..b72d11af92 100644 --- a/src/transformers/modeling_gpt2.py +++ b/src/transformers/modeling_gpt2.py @@ -24,6 +24,7 @@ import torch import torch.nn as nn from torch.nn import CrossEntropyLoss +from .activations import gelu_new from .configuration_gpt2 import GPT2Config from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer @@ -95,10 +96,6 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path): return model -def gelu(x): - return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - - class Attention(nn.Module): def __init__(self, nx, n_ctx, config, scale=False): super().__init__() @@ -206,7 +203,7 @@ class MLP(nn.Module): nx = config.n_embd self.c_fc = Conv1D(n_state, nx) self.c_proj = Conv1D(nx, n_state) - self.act = gelu + self.act = gelu_new self.dropout = nn.Dropout(config.resid_pdrop) def forward(self, x): diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py index 70abd5a1dc..c81620fd1b 100644 --- a/src/transformers/modeling_openai.py +++ b/src/transformers/modeling_openai.py @@ -25,6 +25,7 @@ import torch import torch.nn as nn from torch.nn import CrossEntropyLoss +from .activations import gelu_new, swish from .configuration_openai import OpenAIGPTConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer @@ -114,15 +115,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): return model -def gelu(x): - return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - - -def swish(x): - return x * torch.sigmoid(x) - - -ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu} +ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu_new} class Attention(nn.Module): diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 07894cf0e0..e272be89ec 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -18,12 +18,14 @@ import logging import os +import typing import torch from torch import nn from torch.nn import CrossEntropyLoss from torch.nn import functional as F +from .activations import get_activation from .configuration_utils import PretrainedConfig from .file_utils import ( DUMMY_INPUTS, @@ -1378,15 +1380,15 @@ class SequenceSummary(nn.Module): - 'attn' => Not implemented now, use multi-head attention summary_use_proj: Add a projection after the vector extraction summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False. - summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default + summary_activation: 'tanh' or another string => add an activation to the output, Other => no activation. Default summary_first_dropout: Add a dropout before the projection and activation summary_last_dropout: Add a dropout after the projection and activation """ - def __init__(self, config): + def __init__(self, config: PretrainedConfig): super().__init__() - self.summary_type = config.summary_type if hasattr(config, "summary_type") else "last" + self.summary_type = getattr(config, "summary_type", "last") if self.summary_type == "attn": # We should use a standard multi-head attention module with absolute positional embedding for that. # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 @@ -1401,9 +1403,10 @@ class SequenceSummary(nn.Module): num_classes = config.hidden_size self.summary = nn.Linear(config.hidden_size, num_classes) - self.activation = Identity() - if hasattr(config, "summary_activation") and config.summary_activation == "tanh": - self.activation = nn.Tanh() + activation_string = getattr(config, "summary_activation", None) + self.activation = ( + get_activation(activation_string) if activation_string else Identity() + ) # type: typing.Callable self.first_dropout = Identity() if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py index 9ba5540f9c..0742005979 100644 --- a/src/transformers/modeling_xlm.py +++ b/src/transformers/modeling_xlm.py @@ -26,6 +26,7 @@ from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from torch.nn import functional as F +from .activations import gelu from .configuration_xlm import XLMConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PreTrainedModel, SequenceSummary, SQuADHead, prune_linear_layer @@ -55,17 +56,6 @@ def create_sinusoidal_embeddings(n_pos, dim, out): out.requires_grad = False -def gelu(x): - """ - GELU activation - https://arxiv.org/abs/1606.08415 - https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14 - https://github.com/huggingface/transformers/blob/master/modeling.py - """ - # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0))) - - def get_masks(slen, lengths, causal, padding_mask=None): """ Generate hidden states mask, and optionally an attention mask. diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py index b3daad921f..14c4d580aa 100644 --- a/src/transformers/modeling_xlnet.py +++ b/src/transformers/modeling_xlnet.py @@ -18,13 +18,13 @@ import logging -import math import torch from torch import nn from torch.nn import CrossEntropyLoss, MSELoss from torch.nn import functional as F +from .activations import gelu_new, swish from .configuration_xlnet import XLNetConfig from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_utils import PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, PreTrainedModel, SequenceSummary @@ -183,20 +183,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path): return model -def gelu(x): - """ Implementation of the gelu activation function. - XLNet is using OpenAI GPT's gelu (not exactly the same as BERT) - Also see https://arxiv.org/abs/1606.08415 - """ - cdf = 0.5 * (1.0 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) - return x * cdf - - -def swish(x): - return x * torch.sigmoid(x) - - -ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} +ACT2FN = {"gelu": gelu_new, "relu": torch.nn.functional.relu, "swish": swish} XLNetLayerNorm = nn.LayerNorm diff --git a/tests/test_activations.py b/tests/test_activations.py new file mode 100644 index 0000000000..d6cbc1f9e5 --- /dev/null +++ b/tests/test_activations.py @@ -0,0 +1,28 @@ +import unittest + +from transformers import is_torch_available + +from .utils import require_torch + + +if is_torch_available(): + from transformers.activations import _gelu_python, get_activation, gelu_new + import torch + + +@require_torch +class TestActivations(unittest.TestCase): + def test_gelu_versions(self): + x = torch.Tensor([-100, -1, -0.1, 0, 0.1, 1.0, 100]) + torch_builtin = get_activation("gelu") + self.assertTrue(torch.eq(_gelu_python(x), torch_builtin(x)).all().item()) + self.assertFalse(torch.eq(_gelu_python(x), gelu_new(x)).all().item()) + + def test_get_activation(self): + get_activation("swish") + get_activation("relu") + get_activation("tanh") + with self.assertRaises(KeyError): + get_activation("bogus") + with self.assertRaises(KeyError): + get_activation(None)