From 00112c35393e4d81ef7593a3763dc626c0403e7b Mon Sep 17 00:00:00 2001 From: TFUsers <25044281+TFUsers@users.noreply.github.com> Date: Fri, 30 Oct 2020 12:09:10 -0700 Subject: [PATCH] Replace swish with silu (#8166) * Replace swish with silu * revert nn.silu to nn.swish due to older version * simplify optimized silu conditional and fix format * Update activations.py * Update activations_tf.py * Update modeling_flax_utils.py * Update modeling_openai.py * add swish testcase * add pytorch swish testcase * Add more robust python version check * more formatting fixes Co-authored-by: TFUsers --- src/transformers/activations.py | 37 +++++++++++++------ src/transformers/activations_tf.py | 1 + src/transformers/configuration_albert.py | 2 +- src/transformers/configuration_bart.py | 2 +- src/transformers/configuration_bert.py | 2 +- .../configuration_bert_generation.py | 2 +- src/transformers/configuration_blenderbot.py | 2 +- src/transformers/configuration_deberta.py | 2 +- src/transformers/configuration_distilbert.py | 2 +- src/transformers/configuration_dpr.py | 2 +- src/transformers/configuration_electra.py | 2 +- src/transformers/configuration_fsmt.py | 2 +- src/transformers/configuration_funnel.py | 2 +- src/transformers/configuration_gpt2.py | 2 +- src/transformers/configuration_layoutlm.py | 2 +- src/transformers/configuration_lxmert.py | 2 +- src/transformers/configuration_marian.py | 2 +- src/transformers/configuration_mbart.py | 2 +- src/transformers/configuration_mobilebert.py | 2 +- src/transformers/configuration_openai.py | 2 +- src/transformers/configuration_pegasus.py | 2 +- src/transformers/configuration_prophetnet.py | 2 +- src/transformers/configuration_reformer.py | 2 +- src/transformers/configuration_retribert.py | 2 +- src/transformers/configuration_squeezebert.py | 2 +- src/transformers/configuration_xlnet.py | 2 +- src/transformers/modeling_flax_utils.py | 1 + src/transformers/modeling_openai.py | 4 +- tests/test_activations.py | 1 + tests/test_activations_tf.py | 1 + 30 files changed, 56 insertions(+), 37 deletions(-) diff --git a/src/transformers/activations.py b/src/transformers/activations.py index e1f238ab4f..12f8408d11 100644 --- a/src/transformers/activations.py +++ b/src/transformers/activations.py @@ -2,6 +2,7 @@ import math import torch import torch.nn.functional as F +from packaging import version from .utils import logging @@ -9,29 +10,25 @@ from .utils import logging logger = logging.get_logger(__name__) -def swish(x): - return x * torch.sigmoid(x) - - def _gelu_python(x): """ - Original Implementation of the gelu activation function in Google Bert repo when initially created. For - information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + + Original Implementation of the GELU activation function in Google BERT repo when initially created. For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in - torch.nn.functional Also see https://arxiv.org/abs/1606.08415 + torch.nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 """ return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) def gelu_new(x): """ - Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT). Also see - https://arxiv.org/abs/1606.08415 + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 """ return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0)))) -if torch.__version__ < "1.4.0": +if version.parse(torch.__version__) < version.parse("1.4"): gelu = _gelu_python else: gelu = F.gelu @@ -41,6 +38,23 @@ def gelu_fast(x): return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x))) +def _silu_python(x): + """ + See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear + Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function + Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated + Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with + later. + """ + return x * torch.sigmoid(x) + + +if version.parse(torch.__version__) < version.parse("1.7"): + silu = _silu_python +else: + silu = F.silu + + def mish(x): return x * torch.tanh(torch.nn.functional.softplus(x)) @@ -51,7 +65,8 @@ def linear_act(x): ACT2FN = { "relu": F.relu, - "swish": swish, + "silu": silu, + "swish": silu, "gelu": gelu, "tanh": torch.tanh, "gelu_new": gelu_new, diff --git a/src/transformers/activations_tf.py b/src/transformers/activations_tf.py index c6e71b9d4d..1e330f4ccb 100644 --- a/src/transformers/activations_tf.py +++ b/src/transformers/activations_tf.py @@ -52,6 +52,7 @@ ACT2FN = { "gelu": tf.keras.layers.Activation(gelu), "relu": tf.keras.activations.relu, "swish": tf.keras.activations.swish, + "silu": tf.keras.activations.swish, "gelu_new": tf.keras.layers.Activation(gelu_new), "mish": tf.keras.layers.Activation(mish), "tanh": tf.keras.activations.tanh, diff --git a/src/transformers/configuration_albert.py b/src/transformers/configuration_albert.py index 958876558b..78bde71570 100644 --- a/src/transformers/configuration_albert.py +++ b/src/transformers/configuration_albert.py @@ -61,7 +61,7 @@ class AlbertConfig(PretrainedConfig): The number of inner repetition of attention and ffn. hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu_new"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0): diff --git a/src/transformers/configuration_bart.py b/src/transformers/configuration_bart.py index a5f79f33d1..1bc06624a0 100644 --- a/src/transformers/configuration_bart.py +++ b/src/transformers/configuration_bart.py @@ -59,7 +59,7 @@ class BartConfig(PretrainedConfig): Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. dropout (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (:obj:`float`, `optional`, defaults to 0.0): diff --git a/src/transformers/configuration_bert.py b/src/transformers/configuration_bert.py index 8c9ec766d1..5cb86168d0 100644 --- a/src/transformers/configuration_bert.py +++ b/src/transformers/configuration_bert.py @@ -74,7 +74,7 @@ class BertConfig(PretrainedConfig): Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): diff --git a/src/transformers/configuration_bert_generation.py b/src/transformers/configuration_bert_generation.py index 0342d4909c..3b9dc4873f 100644 --- a/src/transformers/configuration_bert_generation.py +++ b/src/transformers/configuration_bert_generation.py @@ -40,7 +40,7 @@ class BertGenerationConfig(PretrainedConfig): Dimensionality of the "intermediate" (often called feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): diff --git a/src/transformers/configuration_blenderbot.py b/src/transformers/configuration_blenderbot.py index ef9b97db5a..449089a862 100644 --- a/src/transformers/configuration_blenderbot.py +++ b/src/transformers/configuration_blenderbot.py @@ -56,7 +56,7 @@ class BlenderbotConfig(BartConfig): Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. dropout (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (:obj:`float`, `optional`, defaults to 0.0): diff --git a/src/transformers/configuration_deberta.py b/src/transformers/configuration_deberta.py index e305784e84..ffc236df41 100644 --- a/src/transformers/configuration_deberta.py +++ b/src/transformers/configuration_deberta.py @@ -52,7 +52,7 @@ class DebertaConfig(PretrainedConfig): Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`, + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`, :obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported. hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. diff --git a/src/transformers/configuration_distilbert.py b/src/transformers/configuration_distilbert.py index 42a6eae22e..fce9563af2 100644 --- a/src/transformers/configuration_distilbert.py +++ b/src/transformers/configuration_distilbert.py @@ -66,7 +66,7 @@ class DistilBertConfig(PretrainedConfig): The dropout ratio for the attention probabilities. activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. initializer_range (:obj:`float`, `optional`, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. qa_dropout (:obj:`float`, `optional`, defaults to 0.1): diff --git a/src/transformers/configuration_dpr.py b/src/transformers/configuration_dpr.py index b079e8a7d6..506a2c4f5b 100644 --- a/src/transformers/configuration_dpr.py +++ b/src/transformers/configuration_dpr.py @@ -55,7 +55,7 @@ class DPRConfig(PretrainedConfig): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): diff --git a/src/transformers/configuration_electra.py b/src/transformers/configuration_electra.py index 91253f0aef..066c1d501e 100644 --- a/src/transformers/configuration_electra.py +++ b/src/transformers/configuration_electra.py @@ -60,7 +60,7 @@ class ElectraConfig(PretrainedConfig): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): diff --git a/src/transformers/configuration_fsmt.py b/src/transformers/configuration_fsmt.py index 4008d93fb1..16a68b514d 100644 --- a/src/transformers/configuration_fsmt.py +++ b/src/transformers/configuration_fsmt.py @@ -71,7 +71,7 @@ class FSMTConfig(PretrainedConfig): Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. activation_function (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. dropout (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (:obj:`float`, `optional`, defaults to 0.0): diff --git a/src/transformers/configuration_funnel.py b/src/transformers/configuration_funnel.py index c1b6a284af..7883aec923 100644 --- a/src/transformers/configuration_funnel.py +++ b/src/transformers/configuration_funnel.py @@ -66,7 +66,7 @@ class FunnelConfig(PretrainedConfig): Inner dimension in the feed-forward blocks. hidden_act (:obj:`str` or :obj:`callable`, `optional`, defaults to :obj:`"gelu_new"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. hidden_dropout (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (:obj:`float`, `optional`, defaults to 0.1): diff --git a/src/transformers/configuration_gpt2.py b/src/transformers/configuration_gpt2.py index af8fc331a6..7a054e4bbe 100644 --- a/src/transformers/configuration_gpt2.py +++ b/src/transformers/configuration_gpt2.py @@ -60,7 +60,7 @@ class GPT2Config(PretrainedConfig): n_inner (:obj:`int`, `optional`, defaults to None): Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd activation_function (:obj:`str`, `optional`, defaults to :obj:`"gelu"`): - Activation function, to be selected in the list :obj:`["relu", "swish", "gelu", "tanh", "gelu_new"]`. + Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`. resid_pdrop (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. embd_pdrop (:obj:`int`, `optional`, defaults to 0.1): diff --git a/src/transformers/configuration_layoutlm.py b/src/transformers/configuration_layoutlm.py index 75e5fe717c..f16e17e4ea 100644 --- a/src/transformers/configuration_layoutlm.py +++ b/src/transformers/configuration_layoutlm.py @@ -52,7 +52,7 @@ class LayoutLMConfig(BertConfig): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): diff --git a/src/transformers/configuration_lxmert.py b/src/transformers/configuration_lxmert.py index 0c06d14ebd..e18d4ed031 100644 --- a/src/transformers/configuration_lxmert.py +++ b/src/transformers/configuration_lxmert.py @@ -55,7 +55,7 @@ class LxmertConfig(PretrainedConfig): Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): diff --git a/src/transformers/configuration_marian.py b/src/transformers/configuration_marian.py index 8e4e257ce9..042062a314 100644 --- a/src/transformers/configuration_marian.py +++ b/src/transformers/configuration_marian.py @@ -50,7 +50,7 @@ class MarianConfig(BartConfig): Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. dropout (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (:obj:`float`, `optional`, defaults to 0.0): diff --git a/src/transformers/configuration_mbart.py b/src/transformers/configuration_mbart.py index 8406236889..b03b6f9777 100644 --- a/src/transformers/configuration_mbart.py +++ b/src/transformers/configuration_mbart.py @@ -55,7 +55,7 @@ class MBartConfig(BartConfig): Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. dropout (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (:obj:`float`, `optional`, defaults to 0.0): diff --git a/src/transformers/configuration_mobilebert.py b/src/transformers/configuration_mobilebert.py index a67ff5a79d..37493d1fb6 100644 --- a/src/transformers/configuration_mobilebert.py +++ b/src/transformers/configuration_mobilebert.py @@ -48,7 +48,7 @@ class MobileBertConfig(PretrainedConfig): Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.0): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): diff --git a/src/transformers/configuration_openai.py b/src/transformers/configuration_openai.py index fb8f68411a..2301c36922 100644 --- a/src/transformers/configuration_openai.py +++ b/src/transformers/configuration_openai.py @@ -54,7 +54,7 @@ class OpenAIGPTConfig(PretrainedConfig): Number of attention heads for each attention layer in the Transformer encoder. afn (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. resid_pdrop (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. embd_pdrop (:obj:`int`, `optional`, defaults to 0.1): diff --git a/src/transformers/configuration_pegasus.py b/src/transformers/configuration_pegasus.py index 8bda4dc114..ed56f0b22c 100644 --- a/src/transformers/configuration_pegasus.py +++ b/src/transformers/configuration_pegasus.py @@ -94,7 +94,7 @@ class PegasusConfig(BartConfig): Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder. activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. dropout (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (:obj:`float`, `optional`, defaults to 0.0): diff --git a/src/transformers/configuration_prophetnet.py b/src/transformers/configuration_prophetnet.py index 3dc2b011ea..451c93954c 100644 --- a/src/transformers/configuration_prophetnet.py +++ b/src/transformers/configuration_prophetnet.py @@ -39,7 +39,7 @@ class ProphetNetConfig(PretrainedConfig): The dropout ratio for activations inside the fully connected layer. activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. vocab_size (:obj:`int`, `optional`, defaults to 30522): Vocabulary size of the ProphetNET model. Defines the number of different tokens that can be represented by the :obj:`inputs_ids` passed when calling :class:`~transformers.ProphetNetModel`. diff --git a/src/transformers/configuration_reformer.py b/src/transformers/configuration_reformer.py index 0ef4b598b7..55367d1188 100755 --- a/src/transformers/configuration_reformer.py +++ b/src/transformers/configuration_reformer.py @@ -80,7 +80,7 @@ class ReformerConfig(PretrainedConfig): :obj:`None` to ensure fully random rotations in local sensitive hashing scheme. hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`): The non-linear activation function (function or string) in the feed forward layer in the residual attention - block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.05): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. hidden_size (:obj:`int`, `optional`, defaults to 256): diff --git a/src/transformers/configuration_retribert.py b/src/transformers/configuration_retribert.py index 36e04faa71..a68801cbcc 100644 --- a/src/transformers/configuration_retribert.py +++ b/src/transformers/configuration_retribert.py @@ -49,7 +49,7 @@ class RetriBertConfig(PretrainedConfig): Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): diff --git a/src/transformers/configuration_squeezebert.py b/src/transformers/configuration_squeezebert.py index 666c79ab2f..80f456cb5b 100644 --- a/src/transformers/configuration_squeezebert.py +++ b/src/transformers/configuration_squeezebert.py @@ -50,7 +50,7 @@ class SqueezeBertConfig(PretrainedConfig): Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, - :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported. hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1): diff --git a/src/transformers/configuration_xlnet.py b/src/transformers/configuration_xlnet.py index 365162de71..05eda6010c 100644 --- a/src/transformers/configuration_xlnet.py +++ b/src/transformers/configuration_xlnet.py @@ -54,7 +54,7 @@ class XLNetConfig(PretrainedConfig): Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. ff_activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`): The non-linear activation function (function or string) in the If string, :obj:`"gelu"`, :obj:`"relu"`, - :obj:`"swish"` and :obj:`"gelu_new"` are supported. + :obj:`"silu"` and :obj:`"gelu_new"` are supported. untie_r (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether or not to untie relative position biases attn_type (:obj:`str`, `optional`, defaults to :obj:`"bi"`): diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py index 32cb9f450e..6c9eb14ce4 100644 --- a/src/transformers/modeling_flax_utils.py +++ b/src/transformers/modeling_flax_utils.py @@ -53,6 +53,7 @@ def gelu(x): ACT2FN = { "gelu": nn.gelu, "relu": nn.relu, + "silu": nn.swish, "swish": nn.swish, "gelu_new": gelu, } diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py index b69ef557b9..5bfb2b682a 100644 --- a/src/transformers/modeling_openai.py +++ b/src/transformers/modeling_openai.py @@ -27,7 +27,7 @@ import torch import torch.nn as nn from torch.nn import CrossEntropyLoss, MSELoss -from .activations import gelu_new, swish +from .activations import gelu_new, silu from .configuration_openai import OpenAIGPTConfig from .file_utils import ( ModelOutput, @@ -139,7 +139,7 @@ def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path): return model -ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu_new} +ACT_FNS = {"relu": nn.ReLU, "silu": silu, "gelu": gelu_new, "swish": silu} class Attention(nn.Module): diff --git a/tests/test_activations.py b/tests/test_activations.py index a5a9a23477..cc92ea3cda 100644 --- a/tests/test_activations.py +++ b/tests/test_activations.py @@ -20,6 +20,7 @@ class TestActivations(unittest.TestCase): def test_get_activation(self): get_activation("swish") + get_activation("silu") get_activation("relu") get_activation("tanh") get_activation("gelu_new") diff --git a/tests/test_activations_tf.py b/tests/test_activations_tf.py index bdaecff407..406105c09b 100644 --- a/tests/test_activations_tf.py +++ b/tests/test_activations_tf.py @@ -12,6 +12,7 @@ if is_tf_available(): class TestTFActivations(unittest.TestCase): def test_get_activation(self): get_tf_activation("swish") + get_tf_activation("silu") get_tf_activation("gelu") get_tf_activation("relu") get_tf_activation("tanh")