Fix arg name in BLOOM testing and remove unused arg documentation (#18843)
This commit is contained in:
@@ -62,18 +62,12 @@ class BloomConfig(PretrainedConfig):
|
|||||||
Number of hidden layers in the Transformer encoder.
|
Number of hidden layers in the Transformer encoder.
|
||||||
n_head (`int`, *optional*, defaults to 12):
|
n_head (`int`, *optional*, defaults to 12):
|
||||||
Number of attention heads for each attention layer in the Transformer encoder.
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
attn_pdrop (`float`, *optional*, defaults to 0.1):
|
|
||||||
The dropout ratio for the attention.
|
|
||||||
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
|
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
|
||||||
The epsilon to use in the layer normalization layers.
|
The epsilon to use in the layer normalization layers.
|
||||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`):
|
apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`):
|
||||||
If enabled, use the layer norm of the hidden states as the residual in the transformer blocks
|
If enabled, use the layer norm of the hidden states as the residual in the transformer blocks
|
||||||
skip_bias_add (`bool`, *optional*, defaults to `True`):
|
|
||||||
If set to `True`, it will skip bias add for each linear layer in the transformer blocks
|
|
||||||
skip_bias_add_qkv (`bool`, *optional*, defaults to `False`):
|
|
||||||
If set to `True`, it will skip bias add for the first linear layer in the transformer blocks
|
|
||||||
hidden_dropout (`float`, *optional*, defaults to 0.1):
|
hidden_dropout (`float`, *optional*, defaults to 0.1):
|
||||||
Dropout rate of the dropout function on the bias dropout.
|
Dropout rate of the dropout function on the bias dropout.
|
||||||
attention_dropout (`float`, *optional*, defaults to 0.1):
|
attention_dropout (`float`, *optional*, defaults to 0.1):
|
||||||
@@ -124,7 +118,7 @@ class BloomConfig(PretrainedConfig):
|
|||||||
n_head=8,
|
n_head=8,
|
||||||
layer_norm_epsilon=1e-5,
|
layer_norm_epsilon=1e-5,
|
||||||
initializer_range=0.02,
|
initializer_range=0.02,
|
||||||
use_cache=False,
|
use_cache=True,
|
||||||
bos_token_id=1,
|
bos_token_id=1,
|
||||||
eos_token_id=2,
|
eos_token_id=2,
|
||||||
apply_residual_connection_post_layernorm=False,
|
apply_residual_connection_post_layernorm=False,
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ class BloomModelTester:
|
|||||||
intermediate_size=37,
|
intermediate_size=37,
|
||||||
hidden_act="gelu",
|
hidden_act="gelu",
|
||||||
hidden_dropout_prob=0.1,
|
hidden_dropout_prob=0.1,
|
||||||
attention_probs_dropout_prob=0.1,
|
attention_dropout_prob=0.1,
|
||||||
max_position_embeddings=512,
|
max_position_embeddings=512,
|
||||||
type_vocab_size=16,
|
type_vocab_size=16,
|
||||||
type_sequence_label_size=2,
|
type_sequence_label_size=2,
|
||||||
@@ -81,7 +81,7 @@ class BloomModelTester:
|
|||||||
self.intermediate_size = intermediate_size
|
self.intermediate_size = intermediate_size
|
||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
self.hidden_dropout_prob = hidden_dropout_prob
|
self.hidden_dropout_prob = hidden_dropout_prob
|
||||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
self.attention_dropout_prob = attention_dropout_prob
|
||||||
self.max_position_embeddings = max_position_embeddings
|
self.max_position_embeddings = max_position_embeddings
|
||||||
self.type_vocab_size = type_vocab_size
|
self.type_vocab_size = type_vocab_size
|
||||||
self.type_sequence_label_size = type_sequence_label_size
|
self.type_sequence_label_size = type_sequence_label_size
|
||||||
@@ -118,8 +118,8 @@ class BloomModelTester:
|
|||||||
hidden_size=self.hidden_size,
|
hidden_size=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
n_head=self.num_attention_heads,
|
n_head=self.num_attention_heads,
|
||||||
resid_pdrop=self.hidden_dropout_prob,
|
hidden_dropout=self.hidden_dropout_prob,
|
||||||
attn_pdrop=self.attention_probs_dropout_prob,
|
attention_dropout=self.attention_dropout_prob,
|
||||||
n_positions=self.max_position_embeddings,
|
n_positions=self.max_position_embeddings,
|
||||||
type_vocab_size=self.type_vocab_size,
|
type_vocab_size=self.type_vocab_size,
|
||||||
initializer_range=self.initializer_range,
|
initializer_range=self.initializer_range,
|
||||||
|
|||||||
Reference in New Issue
Block a user