From f3d38632551c7a4bad76399a73541a9e4ee3130c Mon Sep 17 00:00:00 2001
From: Shijie Wu <swu671@bloomberg.net>
Date: Thu, 15 Sep 2022 14:25:32 -0400
Subject: [PATCH] fix arg name in BLOOM testing and remove unused arg document
 (#18843)

---
 src/transformers/models/bloom/configuration_bloom.py | 8 +-------
 tests/models/bloom/test_modeling_bloom.py            | 8 ++++----
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/bloom/configuration_bloom.py b/src/transformers/models/bloom/configuration_bloom.py
index 4bcc6e2edb..1103a8148a 100644
--- a/src/transformers/models/bloom/configuration_bloom.py
+++ b/src/transformers/models/bloom/configuration_bloom.py
@@ -62,18 +62,12 @@ class BloomConfig(PretrainedConfig):
             Number of hidden layers in the Transformer encoder.
         n_head (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        attn_pdrop (`float`, *optional*, defaults to 0.1):
-            The dropout ratio for the attention.
         layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
             The epsilon to use in the layer normalization layers.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
         apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`):
             If enabled, use the layer norm of the hidden states as the residual in the transformer blocks
-        skip_bias_add (`bool`, *optional*, defaults to `True`):
-            If set to `True`, it will skip bias add for each linear layer in the transformer blocks
-        skip_bias_add_qkv (`bool`, *optional*, defaults to `False`):
-            If set to `True`, it will skip bias add for the first linear layer in the transformer blocks
         hidden_dropout (`float`, *optional*, defaults to 0.1):
             Dropout rate of the dropout function on the bias dropout.
         attention_dropout (`float`, *optional*, defaults to 0.1):
@@ -124,7 +118,7 @@ class BloomConfig(PretrainedConfig):
         n_head=8,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
-        use_cache=False,
+        use_cache=True,
         bos_token_id=1,
         eos_token_id=2,
         apply_residual_connection_post_layernorm=False,
diff --git a/tests/models/bloom/test_modeling_bloom.py b/tests/models/bloom/test_modeling_bloom.py
index 12f66b63a8..aa7894d79e 100644
--- a/tests/models/bloom/test_modeling_bloom.py
+++ b/tests/models/bloom/test_modeling_bloom.py
@@ -57,7 +57,7 @@ class BloomModelTester:
         intermediate_size=37,
         hidden_act="gelu",
         hidden_dropout_prob=0.1,
-        attention_probs_dropout_prob=0.1,
+        attention_dropout_prob=0.1,
         max_position_embeddings=512,
         type_vocab_size=16,
         type_sequence_label_size=2,
@@ -81,7 +81,7 @@ class BloomModelTester:
         self.intermediate_size = intermediate_size
         self.hidden_act = hidden_act
         self.hidden_dropout_prob = hidden_dropout_prob
-        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.attention_dropout_prob = attention_dropout_prob
         self.max_position_embeddings = max_position_embeddings
         self.type_vocab_size = type_vocab_size
         self.type_sequence_label_size = type_sequence_label_size
@@ -118,8 +118,8 @@ class BloomModelTester:
             hidden_size=self.hidden_size,
             n_layer=self.num_hidden_layers,
             n_head=self.num_attention_heads,
-            resid_pdrop=self.hidden_dropout_prob,
-            attn_pdrop=self.attention_probs_dropout_prob,
+            hidden_dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_dropout_prob,
             n_positions=self.max_position_embeddings,
             type_vocab_size=self.type_vocab_size,
             initializer_range=self.initializer_range,