From 070dcf1c020e28e96e8e4b5acfb29fb818b0b4dd Mon Sep 17 00:00:00 2001 From: Diganta Misra Date: Thu, 7 Nov 2019 03:45:43 +0530 Subject: [PATCH] Added Mish Activation Function Mish is a new activation function proposed here - https://arxiv.org/abs/1908.08681 It has seen some recent success and has been adopted in spaCy, Thinc, TensorFlow Addons and FastAI-dev. All benchmarks recorded so far (including against ReLU, Swish and GELU) are present in the repository - https://github.com/digantamisra98/Mish It might be a good addition to experiment with, especially in the BERT model. --- transformers/modeling_bert.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index 7c2c6f4602..2baee71f82 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -138,7 +138,11 @@ def swish(x): return x * torch.sigmoid(x) -ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new} +def mish(x): + return x * torch.tanh(nn.functional.softplus(x)) + + +ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "mish": mish} BertLayerNorm = torch.nn.LayerNorm