From 00112c35393e4d81ef7593a3763dc626c0403e7b Mon Sep 17 00:00:00 2001
From: TFUsers <25044281+TFUsers@users.noreply.github.com>
Date: Fri, 30 Oct 2020 12:09:10 -0700
Subject: [PATCH] Replace swish with silu (#8166)

* Replace swish with silu

* revert nn.silu to nn.swish due to older version

* simplify optimized silu conditional and fix format

* Update activations.py

* Update activations_tf.py

* Update modeling_flax_utils.py

* Update modeling_openai.py

* add swish testcase

* add pytorch swish testcase

* Add more robust python version check

* more formatting fixes

Co-authored-by: TFUsers <TFUsers@gmail.com>
---
 src/transformers/activations.py               | 37 +++++++++++++------
 src/transformers/activations_tf.py            |  1 +
 src/transformers/configuration_albert.py      |  2 +-
 src/transformers/configuration_bart.py        |  2 +-
 src/transformers/configuration_bert.py        |  2 +-
 .../configuration_bert_generation.py          |  2 +-
 src/transformers/configuration_blenderbot.py  |  2 +-
 src/transformers/configuration_deberta.py     |  2 +-
 src/transformers/configuration_distilbert.py  |  2 +-
 src/transformers/configuration_dpr.py         |  2 +-
 src/transformers/configuration_electra.py     |  2 +-
 src/transformers/configuration_fsmt.py        |  2 +-
 src/transformers/configuration_funnel.py      |  2 +-
 src/transformers/configuration_gpt2.py        |  2 +-
 src/transformers/configuration_layoutlm.py    |  2 +-
 src/transformers/configuration_lxmert.py      |  2 +-
 src/transformers/configuration_marian.py      |  2 +-
 src/transformers/configuration_mbart.py       |  2 +-
 src/transformers/configuration_mobilebert.py  |  2 +-
 src/transformers/configuration_openai.py      |  2 +-
 src/transformers/configuration_pegasus.py     |  2 +-
 src/transformers/configuration_prophetnet.py  |  2 +-
 src/transformers/configuration_reformer.py    |  2 +-
 src/transformers/configuration_retribert.py   |  2 +-
 src/transformers/configuration_squeezebert.py |  2 +-
 src/transformers/configuration_xlnet.py       |  2 +-
 src/transformers/modeling_flax_utils.py       |  1 +
 src/transformers/modeling_openai.py           |  4 +-
 tests/test_activations.py                     |  1 +
 tests/test_activations_tf.py                  |  1 +
 30 files changed, 56 insertions(+), 37 deletions(-)

diff --git a/src/transformers/activations.py b/src/transformers/activations.py
index e1f238ab4f..12f8408d11 100644
--- a/src/transformers/activations.py
+++ b/src/transformers/activations.py
@@ -2,6 +2,7 @@ import math
 
 import torch
 import torch.nn.functional as F
+from packaging import version
 
 from .utils import logging
 
@@ -9,29 +10,25 @@ from .utils import logging
 logger = logging.get_logger(__name__)
 
 
-def swish(x):
-    return x * torch.sigmoid(x)
-
-
 def _gelu_python(x):
     """
-    Original Implementation of the gelu activation function in Google Bert repo when initially created. For
-    information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 +
+    Original Implementation of the GELU activation function in Google BERT repo when initially created. For
+    information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
     torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in
-    torch.nn.functional Also see https://arxiv.org/abs/1606.08415
+    torch.nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
     """
     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
 
 
 def gelu_new(x):
     """
-    Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). Also see
-    https://arxiv.org/abs/1606.08415
+    Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
+    the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
     """
     return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
 
 
-if torch.__version__ < "1.4.0":
+if version.parse(torch.__version__) < version.parse("1.4"):
     gelu = _gelu_python
 else:
     gelu = F.gelu
@@ -41,6 +38,23 @@ def gelu_fast(x):
     return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))
 
 
+def _silu_python(x):
+    """
+    See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
+    Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
+    Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
+    Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
+    later.
+    """
+    return x * torch.sigmoid(x)
+
+
+if version.parse(torch.__version__) < version.parse("1.7"):
+    silu = _silu_python
+else:
+    silu = F.silu
+
+
 def mish(x):
     return x * torch.tanh(torch.nn.functional.softplus(x))
 
@@ -51,7 +65,8 @@ def linear_act(x):
 
 ACT2FN = {
     "relu": F.relu,
-    "swish": swish,
+    "silu": silu,
+    "swish": silu,
     "gelu": gelu,
     "tanh": torch.tanh,
     "gelu_new": gelu_new,
diff --git a/src/transformers/activations_tf.py b/src/transformers/activations_tf.py
index c6e71b9d4d..1e330f4ccb 100644
--- a/src/transformers/activations_tf.py
+++ b/src/transformers/activations_tf.py
@@ -52,6 +52,7 @@ ACT2FN = {
     "gelu": tf.keras.layers.Activation(gelu),
     "relu": tf.keras.activations.relu,
     "swish": tf.keras.activations.swish,
+    "silu": tf.keras.activations.swish,
     "gelu_new": tf.keras.layers.Activation(gelu_new),
     "mish": tf.keras.layers.Activation(mish),
     "tanh": tf.keras.activations.tanh,
diff --git a/src/transformers/configuration_albert.py b/src/transformers/configuration_albert.py
index 958876558b..78bde71570 100644
--- a/src/transformers/configuration_albert.py
+++ b/src/transformers/configuration_albert.py
@@ -61,7 +61,7 @@ class AlbertConfig(PretrainedConfig):
             The number of inner repetition of attention and ffn.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0):
diff --git a/src/transformers/configuration_bart.py b/src/transformers/configuration_bart.py
index a5f79f33d1..1bc06624a0 100644
--- a/src/transformers/configuration_bart.py
+++ b/src/transformers/configuration_bart.py
@@ -59,7 +59,7 @@ class BartConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_bert.py b/src/transformers/configuration_bert.py
index 8c9ec766d1..5cb86168d0 100644
--- a/src/transformers/configuration_bert.py
+++ b/src/transformers/configuration_bert.py
@@ -74,7 +74,7 @@ class BertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_bert_generation.py b/src/transformers/configuration_bert_generation.py
index 0342d4909c..3b9dc4873f 100644
--- a/src/transformers/configuration_bert_generation.py
+++ b/src/transformers/configuration_bert_generation.py
@@ -40,7 +40,7 @@ class BertGenerationConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often called feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_blenderbot.py b/src/transformers/configuration_blenderbot.py
index ef9b97db5a..449089a862 100644
--- a/src/transformers/configuration_blenderbot.py
+++ b/src/transformers/configuration_blenderbot.py
@@ -56,7 +56,7 @@ class BlenderbotConfig(BartConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_deberta.py b/src/transformers/configuration_deberta.py
index e305784e84..ffc236df41 100644
--- a/src/transformers/configuration_deberta.py
+++ b/src/transformers/configuration_deberta.py
@@ -52,7 +52,7 @@ class DebertaConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
             :obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
diff --git a/src/transformers/configuration_distilbert.py b/src/transformers/configuration_distilbert.py
index 42a6eae22e..fce9563af2 100644
--- a/src/transformers/configuration_distilbert.py
+++ b/src/transformers/configuration_distilbert.py
@@ -66,7 +66,7 @@ class DistilBertConfig(PretrainedConfig):
             The dropout ratio for the attention probabilities.
         activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         initializer_range (:obj:`float`, `optional`, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         qa_dropout (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_dpr.py b/src/transformers/configuration_dpr.py
index b079e8a7d6..506a2c4f5b 100644
--- a/src/transformers/configuration_dpr.py
+++ b/src/transformers/configuration_dpr.py
@@ -55,7 +55,7 @@ class DPRConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_electra.py b/src/transformers/configuration_electra.py
index 91253f0aef..066c1d501e 100644
--- a/src/transformers/configuration_electra.py
+++ b/src/transformers/configuration_electra.py
@@ -60,7 +60,7 @@ class ElectraConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_fsmt.py b/src/transformers/configuration_fsmt.py
index 4008d93fb1..16a68b514d 100644
--- a/src/transformers/configuration_fsmt.py
+++ b/src/transformers/configuration_fsmt.py
@@ -71,7 +71,7 @@ class FSMTConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_funnel.py b/src/transformers/configuration_funnel.py
index c1b6a284af..7883aec923 100644
--- a/src/transformers/configuration_funnel.py
+++ b/src/transformers/configuration_funnel.py
@@ -66,7 +66,7 @@ class FunnelConfig(PretrainedConfig):
             Inner dimension in the feed-forward blocks.
         hidden_act (:obj:`str` or :obj:`callable`, `optional`, defaults to :obj:`"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py
index af8fc331a6..7a054e4bbe 100644
--- a/src/transformers/configuration_gpt2.py
+++ b/src/transformers/configuration_gpt2.py
@@ -60,7 +60,7 @@ class GPT2Config(PretrainedConfig):
         n_inner (:obj:`int`, `optional`, defaults to None):
             Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
         activation_function (:obj:`str`, `optional`, defaults to :obj:`"gelu"`):
-            Activation function, to be selected in the list :obj:`["relu", "swish", "gelu", "tanh", "gelu_new"]`.
+            Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`.
         resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_layoutlm.py b/src/transformers/configuration_layoutlm.py
index 75e5fe717c..f16e17e4ea 100644
--- a/src/transformers/configuration_layoutlm.py
+++ b/src/transformers/configuration_layoutlm.py
@@ -52,7 +52,7 @@ class LayoutLMConfig(BertConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_lxmert.py b/src/transformers/configuration_lxmert.py
index 0c06d14ebd..e18d4ed031 100644
--- a/src/transformers/configuration_lxmert.py
+++ b/src/transformers/configuration_lxmert.py
@@ -55,7 +55,7 @@ class LxmertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_marian.py b/src/transformers/configuration_marian.py
index 8e4e257ce9..042062a314 100644
--- a/src/transformers/configuration_marian.py
+++ b/src/transformers/configuration_marian.py
@@ -50,7 +50,7 @@ class MarianConfig(BartConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_mbart.py b/src/transformers/configuration_mbart.py
index 8406236889..b03b6f9777 100644
--- a/src/transformers/configuration_mbart.py
+++ b/src/transformers/configuration_mbart.py
@@ -55,7 +55,7 @@ class MBartConfig(BartConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_mobilebert.py b/src/transformers/configuration_mobilebert.py
index a67ff5a79d..37493d1fb6 100644
--- a/src/transformers/configuration_mobilebert.py
+++ b/src/transformers/configuration_mobilebert.py
@@ -48,7 +48,7 @@ class MobileBertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.0):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py
index fb8f68411a..2301c36922 100644
--- a/src/transformers/configuration_openai.py
+++ b/src/transformers/configuration_openai.py
@@ -54,7 +54,7 @@ class OpenAIGPTConfig(PretrainedConfig):
             Number of attention heads for each attention layer in the Transformer encoder.
         afn (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_pegasus.py b/src/transformers/configuration_pegasus.py
index 8bda4dc114..ed56f0b22c 100644
--- a/src/transformers/configuration_pegasus.py
+++ b/src/transformers/configuration_pegasus.py
@@ -94,7 +94,7 @@ class PegasusConfig(BartConfig):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         dropout (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
diff --git a/src/transformers/configuration_prophetnet.py b/src/transformers/configuration_prophetnet.py
index 3dc2b011ea..451c93954c 100644
--- a/src/transformers/configuration_prophetnet.py
+++ b/src/transformers/configuration_prophetnet.py
@@ -39,7 +39,7 @@ class ProphetNetConfig(PretrainedConfig):
             The dropout ratio for activations inside the fully connected layer.
         activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         vocab_size (:obj:`int`, `optional`, defaults to 30522):
             Vocabulary size of the ProphetNET model. Defines the number of different tokens that can be represented by
             the :obj:`inputs_ids` passed when calling :class:`~transformers.ProphetNetModel`.
diff --git a/src/transformers/configuration_reformer.py b/src/transformers/configuration_reformer.py
index 0ef4b598b7..55367d1188 100755
--- a/src/transformers/configuration_reformer.py
+++ b/src/transformers/configuration_reformer.py
@@ -80,7 +80,7 @@ class ReformerConfig(PretrainedConfig):
             :obj:`None` to ensure fully random rotations in local sensitive hashing scheme.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
             The non-linear activation function (function or string) in the feed forward layer in the residual attention
-            block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.05):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         hidden_size (:obj:`int`, `optional`, defaults to 256):
diff --git a/src/transformers/configuration_retribert.py b/src/transformers/configuration_retribert.py
index 36e04faa71..a68801cbcc 100644
--- a/src/transformers/configuration_retribert.py
+++ b/src/transformers/configuration_retribert.py
@@ -49,7 +49,7 @@ class RetriBertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_squeezebert.py b/src/transformers/configuration_squeezebert.py
index 666c79ab2f..80f456cb5b 100644
--- a/src/transformers/configuration_squeezebert.py
+++ b/src/transformers/configuration_squeezebert.py
@@ -50,7 +50,7 @@ class SqueezeBertConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
         attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
diff --git a/src/transformers/configuration_xlnet.py b/src/transformers/configuration_xlnet.py
index 365162de71..05eda6010c 100644
--- a/src/transformers/configuration_xlnet.py
+++ b/src/transformers/configuration_xlnet.py
@@ -54,7 +54,7 @@ class XLNetConfig(PretrainedConfig):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
         ff_activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the If string, :obj:`"gelu"`, :obj:`"relu"`,
-            :obj:`"swish"` and :obj:`"gelu_new"` are supported.
+            :obj:`"silu"` and :obj:`"gelu_new"` are supported.
         untie_r (:obj:`bool`, `optional`, defaults to :obj:`True`):
             Whether or not to untie relative position biases
         attn_type (:obj:`str`, `optional`, defaults to :obj:`"bi"`):
diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index 32cb9f450e..6c9eb14ce4 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -53,6 +53,7 @@ def gelu(x):
 ACT2FN = {
     "gelu": nn.gelu,
     "relu": nn.relu,
+    "silu": nn.swish,
     "swish": nn.swish,
     "gelu_new": gelu,
 }
diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py
index b69ef557b9..5bfb2b682a 100644
--- a/src/transformers/modeling_openai.py
+++ b/src/transformers/modeling_openai.py
@@ -27,7 +27,7 @@ import torch
 import torch.nn as nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .activations import gelu_new, swish
+from .activations import gelu_new, silu
 from .configuration_openai import OpenAIGPTConfig
 from .file_utils import (
     ModelOutput,
@@ -139,7 +139,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
     return model
 
 
-ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu_new}
+ACT_FNS = {"relu": nn.ReLU, "silu": silu, "gelu": gelu_new, "swish": silu}
 
 
 class Attention(nn.Module):
diff --git a/tests/test_activations.py b/tests/test_activations.py
index a5a9a23477..cc92ea3cda 100644
--- a/tests/test_activations.py
+++ b/tests/test_activations.py
@@ -20,6 +20,7 @@ class TestActivations(unittest.TestCase):
 
     def test_get_activation(self):
         get_activation("swish")
+        get_activation("silu")
         get_activation("relu")
         get_activation("tanh")
         get_activation("gelu_new")
diff --git a/tests/test_activations_tf.py b/tests/test_activations_tf.py
index bdaecff407..406105c09b 100644
--- a/tests/test_activations_tf.py
+++ b/tests/test_activations_tf.py
@@ -12,6 +12,7 @@ if is_tf_available():
 class TestTFActivations(unittest.TestCase):
     def test_get_activation(self):
         get_tf_activation("swish")
+        get_tf_activation("silu")
         get_tf_activation("gelu")
         get_tf_activation("relu")
         get_tf_activation("tanh")