Fix argument names in BLOOM tests and remove documentation of unused arguments (#18843)

This commit is contained in:
Shijie Wu
2022-09-15 14:25:32 -04:00
committed by GitHub
parent 16242e1bf0
commit f3d3863255
2 changed files with 5 additions and 11 deletions

View File

@@ -62,18 +62,12 @@ class BloomConfig(PretrainedConfig):
Number of hidden layers in the Transformer encoder. Number of hidden layers in the Transformer encoder.
n_head (`int`, *optional*, defaults to 12): n_head (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder. Number of attention heads for each attention layer in the Transformer encoder.
attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon to use in the layer normalization layers. The epsilon to use in the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02): initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`): apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`):
If enabled, use the layer norm of the hidden states as the residual in the transformer blocks If enabled, use the layer norm of the hidden states as the residual in the transformer blocks
skip_bias_add (`bool`, *optional*, defaults to `True`):
If set to `True`, it will skip bias add for each linear layer in the transformer blocks
skip_bias_add_qkv (`bool`, *optional*, defaults to `False`):
If set to `True`, it will skip bias add for the first linear layer in the transformer blocks
hidden_dropout (`float`, *optional*, defaults to 0.1): hidden_dropout (`float`, *optional*, defaults to 0.1):
Dropout rate of the dropout function on the bias dropout. Dropout rate of the dropout function on the bias dropout.
attention_dropout (`float`, *optional*, defaults to 0.1): attention_dropout (`float`, *optional*, defaults to 0.1):
@@ -124,7 +118,7 @@ class BloomConfig(PretrainedConfig):
n_head=8, n_head=8,
layer_norm_epsilon=1e-5, layer_norm_epsilon=1e-5,
initializer_range=0.02, initializer_range=0.02,
use_cache=False, use_cache=True,
bos_token_id=1, bos_token_id=1,
eos_token_id=2, eos_token_id=2,
apply_residual_connection_post_layernorm=False, apply_residual_connection_post_layernorm=False,

View File

@@ -57,7 +57,7 @@ class BloomModelTester:
intermediate_size=37, intermediate_size=37,
hidden_act="gelu", hidden_act="gelu",
hidden_dropout_prob=0.1, hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1, attention_dropout_prob=0.1,
max_position_embeddings=512, max_position_embeddings=512,
type_vocab_size=16, type_vocab_size=16,
type_sequence_label_size=2, type_sequence_label_size=2,
@@ -81,7 +81,7 @@ class BloomModelTester:
self.intermediate_size = intermediate_size self.intermediate_size = intermediate_size
self.hidden_act = hidden_act self.hidden_act = hidden_act
self.hidden_dropout_prob = hidden_dropout_prob self.hidden_dropout_prob = hidden_dropout_prob
self.attention_probs_dropout_prob = attention_probs_dropout_prob self.attention_dropout_prob = attention_dropout_prob
self.max_position_embeddings = max_position_embeddings self.max_position_embeddings = max_position_embeddings
self.type_vocab_size = type_vocab_size self.type_vocab_size = type_vocab_size
self.type_sequence_label_size = type_sequence_label_size self.type_sequence_label_size = type_sequence_label_size
@@ -118,8 +118,8 @@ class BloomModelTester:
hidden_size=self.hidden_size, hidden_size=self.hidden_size,
n_layer=self.num_hidden_layers, n_layer=self.num_hidden_layers,
n_head=self.num_attention_heads, n_head=self.num_attention_heads,
resid_pdrop=self.hidden_dropout_prob, hidden_dropout=self.hidden_dropout_prob,
attn_pdrop=self.attention_probs_dropout_prob, attention_dropout=self.attention_dropout_prob,
n_positions=self.max_position_embeddings, n_positions=self.max_position_embeddings,
type_vocab_size=self.type_vocab_size, type_vocab_size=self.type_vocab_size,
initializer_range=self.initializer_range, initializer_range=self.initializer_range,