From f3d38632551c7a4bad76399a73541a9e4ee3130c Mon Sep 17 00:00:00 2001 From: Shijie Wu Date: Thu, 15 Sep 2022 14:25:32 -0400 Subject: [PATCH] fix arg name in BLOOM testing and remove unused arg document (#18843) --- src/transformers/models/bloom/configuration_bloom.py | 8 +------- tests/models/bloom/test_modeling_bloom.py | 8 ++++---- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/bloom/configuration_bloom.py b/src/transformers/models/bloom/configuration_bloom.py index 4bcc6e2edb..1103a8148a 100644 --- a/src/transformers/models/bloom/configuration_bloom.py +++ b/src/transformers/models/bloom/configuration_bloom.py @@ -62,18 +62,12 @@ class BloomConfig(PretrainedConfig): Number of hidden layers in the Transformer encoder. n_head (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. - attn_pdrop (`float`, *optional*, defaults to 0.1): - The dropout ratio for the attention. layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): The epsilon to use in the layer normalization layers. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`): If enabled, use the layer norm of the hidden states as the residual in the transformer blocks - skip_bias_add (`bool`, *optional*, defaults to `True`): - If set to `True`, it will skip bias add for each linear layer in the transformer blocks - skip_bias_add_qkv (`bool`, *optional*, defaults to `False`): - If set to `True`, it will skip bias add for the first linear layer in the transformer blocks hidden_dropout (`float`, *optional*, defaults to 0.1): Dropout rate of the dropout function on the bias dropout. attention_dropout (`float`, *optional*, defaults to 0.1): @@ -124,7 +118,7 @@ class BloomConfig(PretrainedConfig): n_head=8, layer_norm_epsilon=1e-5, initializer_range=0.02, - use_cache=False, + use_cache=True, bos_token_id=1, eos_token_id=2, apply_residual_connection_post_layernorm=False, diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py index 12f66b63a8..aa7894d79e 100644 --- a/tests/models/bloom/test_modeling_bloom.py +++ b/tests/models/bloom/test_modeling_bloom.py @@ -57,7 +57,7 @@ class BloomModelTester: intermediate_size=37, hidden_act="gelu", hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, + attention_dropout_prob=0.1, max_position_embeddings=512, type_vocab_size=16, type_sequence_label_size=2, @@ -81,7 +81,7 @@ class BloomModelTester: self.intermediate_size = intermediate_size self.hidden_act = hidden_act self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.attention_dropout_prob = attention_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.type_sequence_label_size = type_sequence_label_size @@ -118,8 +118,8 @@ class BloomModelTester: hidden_size=self.hidden_size, n_layer=self.num_hidden_layers, n_head=self.num_attention_heads, - resid_pdrop=self.hidden_dropout_prob, - attn_pdrop=self.attention_probs_dropout_prob, + hidden_dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_dropout_prob, n_positions=self.max_position_embeddings, type_vocab_size=self.type_vocab_size, initializer_range=self.initializer_range,