Fix check_config_attributes: check all configuration classes (#24231)
* fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -206,8 +206,6 @@ class AlignVisionConfig(PretrainedConfig):
|
|||||||
The epsilon used by the batch normalization layers.
|
The epsilon used by the batch normalization layers.
|
||||||
batch_norm_momentum (`float`, *optional*, defaults to 0.99):
|
batch_norm_momentum (`float`, *optional*, defaults to 0.99):
|
||||||
The momentum used by the batch normalization layers.
|
The momentum used by the batch normalization layers.
|
||||||
dropout_rate (`float`, *optional*, defaults to 0.5):
|
|
||||||
The dropout rate to be applied before final classifier layer.
|
|
||||||
drop_connect_rate (`float`, *optional*, defaults to 0.2):
|
drop_connect_rate (`float`, *optional*, defaults to 0.2):
|
||||||
The drop rate for skip connections.
|
The drop rate for skip connections.
|
||||||
|
|
||||||
@@ -249,7 +247,6 @@ class AlignVisionConfig(PretrainedConfig):
|
|||||||
initializer_range: float = 0.02,
|
initializer_range: float = 0.02,
|
||||||
batch_norm_eps: float = 0.001,
|
batch_norm_eps: float = 0.001,
|
||||||
batch_norm_momentum: float = 0.99,
|
batch_norm_momentum: float = 0.99,
|
||||||
dropout_rate: float = 0.5,
|
|
||||||
drop_connect_rate: float = 0.2,
|
drop_connect_rate: float = 0.2,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
@@ -274,7 +271,6 @@ class AlignVisionConfig(PretrainedConfig):
|
|||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
self.batch_norm_eps = batch_norm_eps
|
self.batch_norm_eps = batch_norm_eps
|
||||||
self.batch_norm_momentum = batch_norm_momentum
|
self.batch_norm_momentum = batch_norm_momentum
|
||||||
self.dropout_rate = dropout_rate
|
|
||||||
self.drop_connect_rate = drop_connect_rate
|
self.drop_connect_rate = drop_connect_rate
|
||||||
self.num_hidden_layers = sum(num_block_repeats) * 4
|
self.num_hidden_layers = sum(num_block_repeats) * 4
|
||||||
|
|
||||||
|
|||||||
@@ -234,7 +234,6 @@ class BlipVisionConfig(PretrainedConfig):
|
|||||||
projection_dim=512,
|
projection_dim=512,
|
||||||
num_hidden_layers=12,
|
num_hidden_layers=12,
|
||||||
num_attention_heads=12,
|
num_attention_heads=12,
|
||||||
num_channels=3,
|
|
||||||
image_size=384,
|
image_size=384,
|
||||||
patch_size=16,
|
patch_size=16,
|
||||||
hidden_act="gelu",
|
hidden_act="gelu",
|
||||||
@@ -250,7 +249,6 @@ class BlipVisionConfig(PretrainedConfig):
|
|||||||
self.projection_dim = projection_dim
|
self.projection_dim = projection_dim
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.num_hidden_layers = num_hidden_layers
|
||||||
self.num_attention_heads = num_attention_heads
|
self.num_attention_heads = num_attention_heads
|
||||||
self.num_channels = num_channels
|
|
||||||
self.patch_size = patch_size
|
self.patch_size = patch_size
|
||||||
self.image_size = image_size
|
self.image_size = image_size
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
|
|||||||
@@ -58,15 +58,10 @@ class Blip2VisionConfig(PretrainedConfig):
|
|||||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||||
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
|
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
|
||||||
to 1e-5): The epsilon used by the layer normalization layers.
|
to 1e-5): The epsilon used by the layer normalization layers.
|
||||||
dropout (`float`, *optional*, defaults to 0.0):
|
|
||||||
The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
|
|
||||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
initializer_factor (`float``, *optional*, defaults to 1):
|
|
||||||
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
|
||||||
testing).
|
|
||||||
qkv_bias (`bool`, *optional*, defaults to `True`):
|
qkv_bias (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to add a bias to the queries and values in the self-attention layers.
|
Whether to add a bias to the queries and values in the self-attention layers.
|
||||||
|
|
||||||
@@ -91,18 +86,14 @@ class Blip2VisionConfig(PretrainedConfig):
|
|||||||
self,
|
self,
|
||||||
hidden_size=1408,
|
hidden_size=1408,
|
||||||
intermediate_size=6144,
|
intermediate_size=6144,
|
||||||
projection_dim=512,
|
|
||||||
num_hidden_layers=39,
|
num_hidden_layers=39,
|
||||||
num_attention_heads=16,
|
num_attention_heads=16,
|
||||||
num_channels=3,
|
|
||||||
image_size=224,
|
image_size=224,
|
||||||
patch_size=14,
|
patch_size=14,
|
||||||
hidden_act="gelu",
|
hidden_act="gelu",
|
||||||
layer_norm_eps=0.00001,
|
layer_norm_eps=0.00001,
|
||||||
dropout=0.0,
|
|
||||||
attention_dropout=0.0,
|
attention_dropout=0.0,
|
||||||
initializer_range=1e-10,
|
initializer_range=1e-10,
|
||||||
initializer_factor=1.0,
|
|
||||||
qkv_bias=True,
|
qkv_bias=True,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
@@ -110,15 +101,11 @@ class Blip2VisionConfig(PretrainedConfig):
|
|||||||
|
|
||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
self.intermediate_size = intermediate_size
|
self.intermediate_size = intermediate_size
|
||||||
self.projection_dim = projection_dim
|
|
||||||
self.dropout = dropout
|
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.num_hidden_layers = num_hidden_layers
|
||||||
self.num_attention_heads = num_attention_heads
|
self.num_attention_heads = num_attention_heads
|
||||||
self.num_channels = num_channels
|
|
||||||
self.patch_size = patch_size
|
self.patch_size = patch_size
|
||||||
self.image_size = image_size
|
self.image_size = image_size
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
self.initializer_factor = initializer_factor
|
|
||||||
self.attention_dropout = attention_dropout
|
self.attention_dropout = attention_dropout
|
||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
@@ -184,8 +171,6 @@ class Blip2QFormerConfig(PretrainedConfig):
|
|||||||
[Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
|
[Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
|
||||||
For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
|
For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
|
||||||
with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
|
with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
|
||||||
classifier_dropout (`float`, *optional*):
|
|
||||||
The dropout ratio for the classification head.
|
|
||||||
cross_attention_frequency (`int`, *optional*, defaults to 2):
|
cross_attention_frequency (`int`, *optional*, defaults to 2):
|
||||||
The frequency of adding cross-attention to the Transformer layers.
|
The frequency of adding cross-attention to the Transformer layers.
|
||||||
encoder_hidden_size (`int`, *optional*, defaults to 1408):
|
encoder_hidden_size (`int`, *optional*, defaults to 1408):
|
||||||
@@ -221,7 +206,6 @@ class Blip2QFormerConfig(PretrainedConfig):
|
|||||||
layer_norm_eps=1e-12,
|
layer_norm_eps=1e-12,
|
||||||
pad_token_id=0,
|
pad_token_id=0,
|
||||||
position_embedding_type="absolute",
|
position_embedding_type="absolute",
|
||||||
classifier_dropout=None,
|
|
||||||
cross_attention_frequency=2,
|
cross_attention_frequency=2,
|
||||||
encoder_hidden_size=1408,
|
encoder_hidden_size=1408,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@@ -240,7 +224,6 @@ class Blip2QFormerConfig(PretrainedConfig):
|
|||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.position_embedding_type = position_embedding_type
|
self.position_embedding_type = position_embedding_type
|
||||||
self.classifier_dropout = classifier_dropout
|
|
||||||
self.cross_attention_frequency = cross_attention_frequency
|
self.cross_attention_frequency = cross_attention_frequency
|
||||||
self.encoder_hidden_size = encoder_hidden_size
|
self.encoder_hidden_size = encoder_hidden_size
|
||||||
|
|
||||||
|
|||||||
@@ -155,8 +155,6 @@ class BridgeTowerTextConfig(PretrainedConfig):
|
|||||||
initializer_factor (`float``, *optional*, defaults to 1):
|
initializer_factor (`float``, *optional*, defaults to 1):
|
||||||
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
||||||
testing).
|
testing).
|
||||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
|
||||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
|
||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
|
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
|
||||||
@@ -170,8 +168,6 @@ class BridgeTowerTextConfig(PretrainedConfig):
|
|||||||
use_cache (`bool`, *optional*, defaults to `True`):
|
use_cache (`bool`, *optional*, defaults to `True`):
|
||||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||||
relevant if `config.is_decoder=True`.
|
relevant if `config.is_decoder=True`.
|
||||||
classifier_dropout (`float`, *optional*):
|
|
||||||
The dropout ratio for the classification head.
|
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
@@ -199,14 +195,12 @@ class BridgeTowerTextConfig(PretrainedConfig):
|
|||||||
attention_probs_dropout_prob=0.1,
|
attention_probs_dropout_prob=0.1,
|
||||||
max_position_embeddings=514,
|
max_position_embeddings=514,
|
||||||
type_vocab_size=1,
|
type_vocab_size=1,
|
||||||
initializer_range=0.02,
|
|
||||||
layer_norm_eps=1e-05,
|
layer_norm_eps=1e-05,
|
||||||
pad_token_id=1,
|
pad_token_id=1,
|
||||||
bos_token_id=0,
|
bos_token_id=0,
|
||||||
eos_token_id=2,
|
eos_token_id=2,
|
||||||
position_embedding_type="absolute",
|
position_embedding_type="absolute",
|
||||||
use_cache=True,
|
use_cache=True,
|
||||||
classifier_dropout=None,
|
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
@@ -222,11 +216,9 @@ class BridgeTowerTextConfig(PretrainedConfig):
|
|||||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||||
self.max_position_embeddings = max_position_embeddings
|
self.max_position_embeddings = max_position_embeddings
|
||||||
self.type_vocab_size = type_vocab_size
|
self.type_vocab_size = type_vocab_size
|
||||||
self.initializer_range = initializer_range
|
|
||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.position_embedding_type = position_embedding_type
|
self.position_embedding_type = position_embedding_type
|
||||||
self.use_cache = use_cache
|
self.use_cache = use_cache
|
||||||
self.classifier_dropout = classifier_dropout
|
|
||||||
self.pad_token_id = pad_token_id
|
self.pad_token_id = pad_token_id
|
||||||
self.bos_token_id = bos_token_id
|
self.bos_token_id = bos_token_id
|
||||||
self.eos_token_id = eos_token_id
|
self.eos_token_id = eos_token_id
|
||||||
|
|||||||
@@ -65,8 +65,6 @@ class ClapTextConfig(PretrainedConfig):
|
|||||||
just in case (e.g., 512 or 1024 or 2048).
|
just in case (e.g., 512 or 1024 or 2048).
|
||||||
type_vocab_size (`int`, *optional*, defaults to 2):
|
type_vocab_size (`int`, *optional*, defaults to 2):
|
||||||
The vocabulary size of the `token_type_ids` passed when calling [`ClapTextModel`].
|
The vocabulary size of the `token_type_ids` passed when calling [`ClapTextModel`].
|
||||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
|
||||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
|
||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
|
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
|
||||||
@@ -80,8 +78,6 @@ class ClapTextConfig(PretrainedConfig):
|
|||||||
use_cache (`bool`, *optional*, defaults to `True`):
|
use_cache (`bool`, *optional*, defaults to `True`):
|
||||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||||
relevant if `config.is_decoder=True`.
|
relevant if `config.is_decoder=True`.
|
||||||
classifier_dropout (`float`, *optional*):
|
|
||||||
The dropout ratio for the classification head.
|
|
||||||
projection_hidden_act (`str`, *optional*, defaults to `"relu"`):
|
projection_hidden_act (`str`, *optional*, defaults to `"relu"`):
|
||||||
The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`,
|
The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`,
|
||||||
`"relu"`, `"silu"` and `"gelu_new"` are supported.
|
`"relu"`, `"silu"` and `"gelu_new"` are supported.
|
||||||
@@ -116,7 +112,6 @@ class ClapTextConfig(PretrainedConfig):
|
|||||||
attention_probs_dropout_prob=0.1,
|
attention_probs_dropout_prob=0.1,
|
||||||
max_position_embeddings=514,
|
max_position_embeddings=514,
|
||||||
type_vocab_size=1,
|
type_vocab_size=1,
|
||||||
initializer_range=0.02,
|
|
||||||
initializer_factor=1.0,
|
initializer_factor=1.0,
|
||||||
layer_norm_eps=1e-12,
|
layer_norm_eps=1e-12,
|
||||||
projection_dim=512,
|
projection_dim=512,
|
||||||
@@ -125,7 +120,6 @@ class ClapTextConfig(PretrainedConfig):
|
|||||||
eos_token_id=2,
|
eos_token_id=2,
|
||||||
position_embedding_type="absolute",
|
position_embedding_type="absolute",
|
||||||
use_cache=True,
|
use_cache=True,
|
||||||
classifier_dropout=None,
|
|
||||||
projection_hidden_act="relu",
|
projection_hidden_act="relu",
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
@@ -141,12 +135,10 @@ class ClapTextConfig(PretrainedConfig):
|
|||||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||||
self.max_position_embeddings = max_position_embeddings
|
self.max_position_embeddings = max_position_embeddings
|
||||||
self.type_vocab_size = type_vocab_size
|
self.type_vocab_size = type_vocab_size
|
||||||
self.initializer_range = initializer_range
|
|
||||||
self.initializer_factor = initializer_factor
|
self.initializer_factor = initializer_factor
|
||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.position_embedding_type = position_embedding_type
|
self.position_embedding_type = position_embedding_type
|
||||||
self.use_cache = use_cache
|
self.use_cache = use_cache
|
||||||
self.classifier_dropout = classifier_dropout
|
|
||||||
self.projection_hidden_act = projection_hidden_act
|
self.projection_hidden_act = projection_hidden_act
|
||||||
self.projection_dim = projection_dim
|
self.projection_dim = projection_dim
|
||||||
|
|
||||||
|
|||||||
@@ -187,16 +187,10 @@ class Pix2StructVisionConfig(PretrainedConfig):
|
|||||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||||
d_kv (`int`, *optional*, defaults to 64):
|
d_kv (`int`, *optional*, defaults to 64):
|
||||||
Dimensionality of the key, query, value projections per attention head.
|
Dimensionality of the key, query, value projections per attention head.
|
||||||
projection_dim (`int`, *optional*, defaults to 768):
|
|
||||||
Dimensionality of the projection layer in the Transformer encoder.
|
|
||||||
num_hidden_layers (`int`, *optional*, defaults to 12):
|
num_hidden_layers (`int`, *optional*, defaults to 12):
|
||||||
Number of hidden layers in the Transformer encoder.
|
Number of hidden layers in the Transformer encoder.
|
||||||
num_attention_heads (`int`, *optional*, defaults to 12):
|
num_attention_heads (`int`, *optional*, defaults to 12):
|
||||||
Number of attention heads for each attention layer in the Transformer encoder.
|
Number of attention heads for each attention layer in the Transformer encoder.
|
||||||
num_channels (`int`, *optional*, defaults to 3):
|
|
||||||
Number of channels of the input images.
|
|
||||||
patch_size (`int`, *optional*, defaults to 16):
|
|
||||||
The size (resolution) of each patch.
|
|
||||||
dense_act_fn (`str` or `function`, *optional*, defaults to `"gelu_new"`):
|
dense_act_fn (`str` or `function`, *optional*, defaults to `"gelu_new"`):
|
||||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||||
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
|
`"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported.
|
||||||
@@ -213,8 +207,6 @@ class Pix2StructVisionConfig(PretrainedConfig):
|
|||||||
testing).
|
testing).
|
||||||
seq_len (`int`, *optional*, defaults to 4096):
|
seq_len (`int`, *optional*, defaults to 4096):
|
||||||
Maximum sequence length (here number of patches) supported by the model.
|
Maximum sequence length (here number of patches) supported by the model.
|
||||||
layer_norm_bias (`bool`, *optional*, defaults to `False`):
|
|
||||||
Whether or not to add a bias to the layer normalization layers.
|
|
||||||
relative_attention_num_buckets (`int`, *optional*, defaults to 32):
|
relative_attention_num_buckets (`int`, *optional*, defaults to 32):
|
||||||
The number of buckets to use for each attention layer.
|
The number of buckets to use for each attention layer.
|
||||||
relative_attention_max_distance (`int`, *optional*, defaults to 128):
|
relative_attention_max_distance (`int`, *optional*, defaults to 128):
|
||||||
@@ -243,11 +235,8 @@ class Pix2StructVisionConfig(PretrainedConfig):
|
|||||||
patch_embed_hidden_size=768,
|
patch_embed_hidden_size=768,
|
||||||
d_ff=2048,
|
d_ff=2048,
|
||||||
d_kv=64,
|
d_kv=64,
|
||||||
projection_dim=768,
|
|
||||||
num_hidden_layers=12,
|
num_hidden_layers=12,
|
||||||
num_attention_heads=12,
|
num_attention_heads=12,
|
||||||
num_channels=3,
|
|
||||||
patch_size=16,
|
|
||||||
dense_act_fn="gelu_new",
|
dense_act_fn="gelu_new",
|
||||||
layer_norm_eps=1e-6,
|
layer_norm_eps=1e-6,
|
||||||
dropout_rate=0.0,
|
dropout_rate=0.0,
|
||||||
@@ -255,7 +244,6 @@ class Pix2StructVisionConfig(PretrainedConfig):
|
|||||||
initializer_range=1e-10,
|
initializer_range=1e-10,
|
||||||
initializer_factor=1.0,
|
initializer_factor=1.0,
|
||||||
seq_len=4096,
|
seq_len=4096,
|
||||||
layer_norm_bias=False,
|
|
||||||
relative_attention_num_buckets=32,
|
relative_attention_num_buckets=32,
|
||||||
relative_attention_max_distance=128,
|
relative_attention_max_distance=128,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
@@ -265,19 +253,15 @@ class Pix2StructVisionConfig(PretrainedConfig):
|
|||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
self.patch_embed_hidden_size = patch_embed_hidden_size
|
self.patch_embed_hidden_size = patch_embed_hidden_size
|
||||||
self.d_ff = d_ff
|
self.d_ff = d_ff
|
||||||
self.projection_dim = projection_dim
|
|
||||||
self.dropout_rate = dropout_rate
|
self.dropout_rate = dropout_rate
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.num_hidden_layers = num_hidden_layers
|
||||||
self.num_attention_heads = num_attention_heads
|
self.num_attention_heads = num_attention_heads
|
||||||
self.num_channels = num_channels
|
|
||||||
self.patch_size = patch_size
|
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
self.initializer_factor = initializer_factor
|
self.initializer_factor = initializer_factor
|
||||||
self.attention_dropout = attention_dropout
|
self.attention_dropout = attention_dropout
|
||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.dense_act_fn = dense_act_fn
|
self.dense_act_fn = dense_act_fn
|
||||||
self.seq_len = seq_len
|
self.seq_len = seq_len
|
||||||
self.layer_norm_bias = layer_norm_bias
|
|
||||||
self.relative_attention_num_buckets = relative_attention_num_buckets
|
self.relative_attention_num_buckets = relative_attention_num_buckets
|
||||||
self.relative_attention_max_distance = relative_attention_max_distance
|
self.relative_attention_max_distance = relative_attention_max_distance
|
||||||
self.d_kv = d_kv
|
self.d_kv = d_kv
|
||||||
|
|||||||
@@ -150,10 +150,6 @@ class SamVisionConfig(PretrainedConfig):
|
|||||||
Args:
|
Args:
|
||||||
hidden_size (`int`, *optional*, defaults to 768):
|
hidden_size (`int`, *optional*, defaults to 768):
|
||||||
Dimensionality of the encoder layers and the pooler layer.
|
Dimensionality of the encoder layers and the pooler layer.
|
||||||
intermediate_size (`int`, *optional*, defaults to 6144):
|
|
||||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
|
||||||
projection_dim (`int`, *optional*, defaults to 512):
|
|
||||||
Dimensionality of the projection layer in the Transformer encoder.
|
|
||||||
output_channels (`int`, *optional*, defaults to 256):
|
output_channels (`int`, *optional*, defaults to 256):
|
||||||
Dimensionality of the output channels in the Patch Encoder.
|
Dimensionality of the output channels in the Patch Encoder.
|
||||||
num_hidden_layers (`int`, *optional*, defaults to 12):
|
num_hidden_layers (`int`, *optional*, defaults to 12):
|
||||||
@@ -170,14 +166,10 @@ class SamVisionConfig(PretrainedConfig):
|
|||||||
The non-linear activation function (function or string)
|
The non-linear activation function (function or string)
|
||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
dropout (`float`, *optional*, defaults to 0.0):
|
|
||||||
The dropout probability.
|
|
||||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
initializer_range (`float`, *optional*, defaults to 1e-10):
|
initializer_range (`float`, *optional*, defaults to 1e-10):
|
||||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
initializer_factor (`float`, *optional*, defaults to 1.0):
|
|
||||||
A factor for multiplying the initializer range.
|
|
||||||
qkv_bias (`bool`, *optional*, defaults to `True`):
|
qkv_bias (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to add a bias to query, key, value projections.
|
Whether to add a bias to query, key, value projections.
|
||||||
mlp_ratio (`float`, *optional*, defaults to 4.0):
|
mlp_ratio (`float`, *optional*, defaults to 4.0):
|
||||||
@@ -200,8 +192,6 @@ class SamVisionConfig(PretrainedConfig):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
hidden_size=768,
|
hidden_size=768,
|
||||||
intermediate_size=6144,
|
|
||||||
projection_dim=512,
|
|
||||||
output_channels=256,
|
output_channels=256,
|
||||||
num_hidden_layers=12,
|
num_hidden_layers=12,
|
||||||
num_attention_heads=12,
|
num_attention_heads=12,
|
||||||
@@ -210,10 +200,8 @@ class SamVisionConfig(PretrainedConfig):
|
|||||||
patch_size=16,
|
patch_size=16,
|
||||||
hidden_act="gelu",
|
hidden_act="gelu",
|
||||||
layer_norm_eps=1e-06,
|
layer_norm_eps=1e-06,
|
||||||
dropout=0.0,
|
|
||||||
attention_dropout=0.0,
|
attention_dropout=0.0,
|
||||||
initializer_range=1e-10,
|
initializer_range=1e-10,
|
||||||
initializer_factor=1.0,
|
|
||||||
qkv_bias=True,
|
qkv_bias=True,
|
||||||
mlp_ratio=4.0,
|
mlp_ratio=4.0,
|
||||||
use_abs_pos=True,
|
use_abs_pos=True,
|
||||||
@@ -227,8 +215,6 @@ class SamVisionConfig(PretrainedConfig):
|
|||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
self.intermediate_size = intermediate_size
|
|
||||||
self.projection_dim = projection_dim
|
|
||||||
self.output_channels = output_channels
|
self.output_channels = output_channels
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.num_hidden_layers = num_hidden_layers
|
||||||
self.num_attention_heads = num_attention_heads
|
self.num_attention_heads = num_attention_heads
|
||||||
@@ -237,10 +223,8 @@ class SamVisionConfig(PretrainedConfig):
|
|||||||
self.patch_size = patch_size
|
self.patch_size = patch_size
|
||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
self.layer_norm_eps = layer_norm_eps
|
self.layer_norm_eps = layer_norm_eps
|
||||||
self.dropout = dropout
|
|
||||||
self.attention_dropout = attention_dropout
|
self.attention_dropout = attention_dropout
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
self.initializer_factor = initializer_factor
|
|
||||||
self.qkv_bias = qkv_bias
|
self.qkv_bias = qkv_bias
|
||||||
self.mlp_ratio = mlp_ratio
|
self.mlp_ratio = mlp_ratio
|
||||||
self.use_abs_pos = use_abs_pos
|
self.use_abs_pos = use_abs_pos
|
||||||
|
|||||||
@@ -17,6 +17,7 @@ import inspect
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
from transformers.configuration_utils import PretrainedConfig
|
||||||
from transformers.utils import direct_transformers_import
|
from transformers.utils import direct_transformers_import
|
||||||
|
|
||||||
|
|
||||||
@@ -77,6 +78,12 @@ SPECIAL_CASES_TO_ALLOW = {
|
|||||||
"TimeSeriesTransformerConfig": ["num_static_real_features", "num_time_features"],
|
"TimeSeriesTransformerConfig": ["num_static_real_features", "num_time_features"],
|
||||||
# used internally to calculate the feature size
|
# used internally to calculate the feature size
|
||||||
"AutoformerConfig": ["num_static_real_features", "num_time_features"],
|
"AutoformerConfig": ["num_static_real_features", "num_time_features"],
|
||||||
|
# used internally to calculate `mlp_dim`
|
||||||
|
"SamVisionConfig": ["mlp_ratio"],
|
||||||
|
# For (head) training, but so far not implemented
|
||||||
|
"ClapAudioConfig": ["num_classes"],
|
||||||
|
# Not used, but providing useful information to users
|
||||||
|
"SpeechT5HifiGanConfig": ["sampling_rate"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -113,6 +120,10 @@ SPECIAL_CASES_TO_ALLOW.update(
|
|||||||
"VanConfig": True,
|
"VanConfig": True,
|
||||||
"WavLMConfig": True,
|
"WavLMConfig": True,
|
||||||
"WhisperConfig": True,
|
"WhisperConfig": True,
|
||||||
|
# TODO: @Arthur (for `alignment_head` and `alignment_layer`)
|
||||||
|
"JukeboxPriorConfig": True,
|
||||||
|
# TODO: @Younes (for `is_decoder`)
|
||||||
|
"Pix2StructTextConfig": True,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -254,10 +265,21 @@ def check_config_attributes_being_used(config_class):
|
|||||||
def check_config_attributes():
|
def check_config_attributes():
|
||||||
"""Check the arguments in `__init__` of all configuration classes are used in python files"""
|
"""Check the arguments in `__init__` of all configuration classes are used in python files"""
|
||||||
configs_with_unused_attributes = {}
|
configs_with_unused_attributes = {}
|
||||||
for config_class in list(CONFIG_MAPPING.values()):
|
for _config_class in list(CONFIG_MAPPING.values()):
|
||||||
unused_attributes = check_config_attributes_being_used(config_class)
|
# Some config classes are not in `CONFIG_MAPPING` (e.g. `CLIPVisionConfig`, `Blip2VisionConfig`, etc.)
|
||||||
if len(unused_attributes) > 0:
|
config_classes_in_module = [
|
||||||
configs_with_unused_attributes[config_class.__name__] = unused_attributes
|
cls
|
||||||
|
for name, cls in inspect.getmembers(
|
||||||
|
inspect.getmodule(_config_class),
|
||||||
|
lambda x: inspect.isclass(x)
|
||||||
|
and issubclass(x, PretrainedConfig)
|
||||||
|
and inspect.getmodule(x) == inspect.getmodule(_config_class),
|
||||||
|
)
|
||||||
|
]
|
||||||
|
for config_class in config_classes_in_module:
|
||||||
|
unused_attributes = check_config_attributes_being_used(config_class)
|
||||||
|
if len(unused_attributes) > 0:
|
||||||
|
configs_with_unused_attributes[config_class.__name__] = unused_attributes
|
||||||
|
|
||||||
if len(configs_with_unused_attributes) > 0:
|
if len(configs_with_unused_attributes) > 0:
|
||||||
error = "The following configuration classes contain unused attributes in the corresponding modeling files:\n"
|
error = "The following configuration classes contain unused attributes in the corresponding modeling files:\n"
|
||||||
|
|||||||
Reference in New Issue
Block a user