From ef74b0f07a190f19c69abc0732ea955e8dd7330f Mon Sep 17 00:00:00 2001
From: Sam Shleifer <sshleifer@gmail.com>
Date: Thu, 13 Feb 2020 08:28:33 -0500
Subject: [PATCH] =?UTF-8?q?get=5Factivation('relu')=20provides=20a=20simpl?=
 =?UTF-8?q?e=20mapping=20from=20strings=20i=E2=80=A6=20(#2807)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* activations.py contains a mapping from string to activation function
* resolves some `gelu` vs `gelu_new` ambiguity
---
 src/transformers/activations.py         | 48 +++++++++++++++++++++++++
 src/transformers/modeling_bert.py       | 21 +----------
 src/transformers/modeling_distilbert.py |  3 +-
 src/transformers/modeling_gpt2.py       |  7 ++--
 src/transformers/modeling_openai.py     | 11 ++----
 src/transformers/modeling_utils.py      | 15 ++++----
 src/transformers/modeling_xlm.py        | 12 +------
 src/transformers/modeling_xlnet.py      | 17 ++-------
 tests/test_activations.py               | 28 +++++++++++++++
 9 files changed, 94 insertions(+), 68 deletions(-)
 create mode 100644 src/transformers/activations.py
 create mode 100644 tests/test_activations.py

diff --git a/src/transformers/activations.py b/src/transformers/activations.py
new file mode 100644
index 0000000000..5d7d3cdc59
--- /dev/null
+++ b/src/transformers/activations.py
@@ -0,0 +1,48 @@
+import math
+
+import torch
+import torch.nn.functional as F
+
+
+def swish(x):
+    return x * torch.sigmoid(x)  # the input scaled by its own sigmoid
+
+
+def _gelu_python(x):
+    """ Original implementation of the gelu activation function in the Google Bert repo when initially created.
+        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
+        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+        Kept as a pure-Python fallback: the module-level `gelu` uses torch.nn.functional.gelu when it exists.
+        Also see https://arxiv.org/abs/1606.08415
+    """
+    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
+
+gelu = getattr(F, "gelu", _gelu_python)  # F.gelu if this torch version has it, else the Python fallback
+
+
+def gelu_new(x):
+    """ Tanh-based implementation of gelu currently in the Google Bert repo (identical to OpenAI GPT's gelu).
+        Gives slightly different results from the erf-based version. Also see https://arxiv.org/abs/1606.08415
+    """
+    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+
+
+ACT2FN = {  # activation name -> callable, as looked up by get_activation
+    "relu": F.relu,
+    "swish": swish,
+    "gelu": gelu,
+    "tanh": torch.tanh,  # torch.tanh rather than the deprecated F.tanh alias
+    "gelu_new": gelu_new,
+}
+
+
+def get_activation(activation_string):
+    """ Look up an activation function by name.
+
+        Returns the callable stored in ACT2FN under `activation_string`.
+        Raises KeyError for any key that is not in the mapping (including None).
+    """
+    if activation_string in ACT2FN:
+        return ACT2FN[activation_string]
+    raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))
diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py
index 6cfbe3d00a..feb3529057 100644
--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -24,6 +24,7 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
+from .activations import gelu, gelu_new, swish
 from .configuration_bert import BertConfig
 from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PreTrainedModel, prune_linear_layer
@@ -129,26 +130,6 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
     return model
 
 
-def gelu(x):
-    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
-        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
-        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-        Also see https://arxiv.org/abs/1606.08415
-    """
-    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
-
-
-def gelu_new(x):
-    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
-        Also see https://arxiv.org/abs/1606.08415
-    """
-    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-
-
-def swish(x):
-    return x * torch.sigmoid(x)
-
-
 def mish(x):
     return x * torch.tanh(nn.functional.softplus(x))
 
diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py
index 6634aacaff..b155efebef 100644
--- a/src/transformers/modeling_distilbert.py
+++ b/src/transformers/modeling_distilbert.py
@@ -27,6 +27,7 @@ import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 
+from .activations import gelu
 from .configuration_distilbert import DistilBertConfig
 from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PreTrainedModel, prune_linear_layer
@@ -47,8 +48,6 @@ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
 
 
 # UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE #
-def gelu(x):
-    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
 
 
 def create_sinusoidal_embeddings(n_pos, dim, out):
diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py
index 2426ef4352..b72d11af92 100644
--- a/src/transformers/modeling_gpt2.py
+++ b/src/transformers/modeling_gpt2.py
@@ -24,6 +24,7 @@ import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 
+from .activations import gelu_new
 from .configuration_gpt2 import GPT2Config
 from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer
@@ -95,10 +96,6 @@ def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
     return model
 
 
-def gelu(x):
-    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-
-
 class Attention(nn.Module):
     def __init__(self, nx, n_ctx, config, scale=False):
         super().__init__()
@@ -206,7 +203,7 @@ class MLP(nn.Module):
         nx = config.n_embd
         self.c_fc = Conv1D(n_state, nx)
         self.c_proj = Conv1D(nx, n_state)
-        self.act = gelu
+        self.act = gelu_new
         self.dropout = nn.Dropout(config.resid_pdrop)
 
     def forward(self, x):
diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py
index 70abd5a1dc..c81620fd1b 100644
--- a/src/transformers/modeling_openai.py
+++ b/src/transformers/modeling_openai.py
@@ -25,6 +25,7 @@ import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 
+from .activations import gelu_new, swish
 from .configuration_openai import OpenAIGPTConfig
 from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import Conv1D, PreTrainedModel, SequenceSummary, prune_conv1d_layer
@@ -114,15 +115,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
     return model
 
 
-def gelu(x):
-    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-
-
-def swish(x):
-    return x * torch.sigmoid(x)
-
-
-ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
+ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu_new}  # "gelu" stays the tanh variant, as before
 
 
 class Attention(nn.Module):
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 07894cf0e0..e272be89ec 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -18,12 +18,14 @@
 
 import logging
 import os
+import typing
 
 import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F
 
+from .activations import get_activation
 from .configuration_utils import PretrainedConfig
 from .file_utils import (
     DUMMY_INPUTS,
@@ -1378,15 +1380,15 @@ class SequenceSummary(nn.Module):
                 - 'attn' => Not implemented now, use multi-head attention
             summary_use_proj: Add a projection after the vector extraction
             summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
-            summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default
+            summary_activation: name of an activation in ACT2FN (e.g. 'tanh') => apply it to the output, None/empty => no activation (default)
             summary_first_dropout: Add a dropout before the projection and activation
             summary_last_dropout: Add a dropout after the projection and activation
     """
 
-    def __init__(self, config):
+    def __init__(self, config: PretrainedConfig):
         super().__init__()
 
-        self.summary_type = config.summary_type if hasattr(config, "summary_type") else "last"
+        self.summary_type = getattr(config, "summary_type", "last")
         if self.summary_type == "attn":
             # We should use a standard multi-head attention module with absolute positional embedding for that.
             # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
@@ -1401,9 +1403,10 @@ class SequenceSummary(nn.Module):
                 num_classes = config.hidden_size
             self.summary = nn.Linear(config.hidden_size, num_classes)
 
-        self.activation = Identity()
-        if hasattr(config, "summary_activation") and config.summary_activation == "tanh":
-            self.activation = nn.Tanh()
+        activation_string = getattr(config, "summary_activation", None)
+        self.activation = (
+            get_activation(activation_string) if activation_string else Identity()
+        )  # type: typing.Callable
 
         self.first_dropout = Identity()
         if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py
index 9ba5540f9c..0742005979 100644
--- a/src/transformers/modeling_xlm.py
+++ b/src/transformers/modeling_xlm.py
@@ -26,6 +26,7 @@ from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 from torch.nn import functional as F
 
+from .activations import gelu
 from .configuration_xlm import XLMConfig
 from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PreTrainedModel, SequenceSummary, SQuADHead, prune_linear_layer
@@ -55,17 +56,6 @@ def create_sinusoidal_embeddings(n_pos, dim, out):
     out.requires_grad = False
 
 
-def gelu(x):
-    """
-    GELU activation
-    https://arxiv.org/abs/1606.08415
-    https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14
-    https://github.com/huggingface/transformers/blob/master/modeling.py
-    """
-    # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
-
-
 def get_masks(slen, lengths, causal, padding_mask=None):
     """
     Generate hidden states mask, and optionally an attention mask.
diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py
index b3daad921f..14c4d580aa 100644
--- a/src/transformers/modeling_xlnet.py
+++ b/src/transformers/modeling_xlnet.py
@@ -18,13 +18,13 @@
 
 
 import logging
-import math
 
 import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 from torch.nn import functional as F
 
+from .activations import gelu_new, swish
 from .configuration_xlnet import XLNetConfig
 from .file_utils import add_start_docstrings, add_start_docstrings_to_callable
 from .modeling_utils import PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits, PreTrainedModel, SequenceSummary
@@ -183,20 +183,7 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
     return model
 
 
-def gelu(x):
-    """ Implementation of the gelu activation function.
-        XLNet is using OpenAI GPT's gelu (not exactly the same as BERT)
-        Also see https://arxiv.org/abs/1606.08415
-    """
-    cdf = 0.5 * (1.0 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-    return x * cdf
-
-
-def swish(x):
-    return x * torch.sigmoid(x)
-
-
-ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
+ACT2FN = {"gelu": gelu_new, "relu": torch.nn.functional.relu, "swish": swish}  # "gelu" stays the tanh variant
 
 
 XLNetLayerNorm = nn.LayerNorm
diff --git a/tests/test_activations.py b/tests/test_activations.py
new file mode 100644
index 0000000000..d6cbc1f9e5
--- /dev/null
+++ b/tests/test_activations.py
@@ -0,0 +1,28 @@
+import unittest
+
+from transformers import is_torch_available
+
+from .utils import require_torch
+
+
+if is_torch_available():
+    from transformers.activations import _gelu_python, get_activation, gelu_new
+    import torch
+
+
+@require_torch
+class TestActivations(unittest.TestCase):
+    def test_gelu_versions(self):
+        x = torch.Tensor([-100, -1, -0.1, 0, 0.1, 1.0, 100])
+        # Tolerance-based: the torch builtin and the Python erf formula need not agree bit-for-bit.
+        self.assertTrue(torch.allclose(_gelu_python(x), get_activation("gelu")(x)))
+        self.assertFalse(torch.allclose(_gelu_python(x), gelu_new(x)))
+
+    def test_get_activation(self):
+        get_activation("swish")
+        get_activation("relu")
+        get_activation("tanh")
+        with self.assertRaises(KeyError):
+            get_activation("bogus")
+        with self.assertRaises(KeyError):
+            get_activation(None)