Fix check_config_attributes: check all configuration classes (#24231)
* fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -206,8 +206,6 @@ class AlignVisionConfig(PretrainedConfig):
|
||||
The epsilon used by the batch normalization layers.
|
||||
batch_norm_momentum (`float`, *optional*, defaults to 0.99):
|
||||
The momentum used by the batch normalization layers.
|
||||
dropout_rate (`float`, *optional*, defaults to 0.5):
|
||||
The dropout rate to be applied before final classifier layer.
|
||||
drop_connect_rate (`float`, *optional*, defaults to 0.2):
|
||||
The drop rate for skip connections.
|
||||
|
||||
@@ -249,7 +247,6 @@ class AlignVisionConfig(PretrainedConfig):
|
||||
initializer_range: float = 0.02,
|
||||
batch_norm_eps: float = 0.001,
|
||||
batch_norm_momentum: float = 0.99,
|
||||
dropout_rate: float = 0.5,
|
||||
drop_connect_rate: float = 0.2,
|
||||
**kwargs,
|
||||
):
|
||||
@@ -274,7 +271,6 @@ class AlignVisionConfig(PretrainedConfig):
|
||||
self.initializer_range = initializer_range
|
||||
self.batch_norm_eps = batch_norm_eps
|
||||
self.batch_norm_momentum = batch_norm_momentum
|
||||
self.dropout_rate = dropout_rate
|
||||
self.drop_connect_rate = drop_connect_rate
|
||||
self.num_hidden_layers = sum(num_block_repeats) * 4
|
||||
|
||||
|
||||
@@ -234,7 +234,6 @@ class BlipVisionConfig(PretrainedConfig):
|
||||
projection_dim=512,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
num_channels=3,
|
||||
image_size=384,
|
||||
patch_size=16,
|
||||
hidden_act="gelu",
|
||||
@@ -250,7 +249,6 @@ class BlipVisionConfig(PretrainedConfig):
|
||||
self.projection_dim = projection_dim
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_channels = num_channels
|
||||
self.patch_size = patch_size
|
||||
self.image_size = image_size
|
||||
self.initializer_range = initializer_range
|
||||
|
||||
@@ -58,15 +58,10 @@ class Blip2VisionConfig(PretrainedConfig):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||
`"relu"`, `"selu"` and `"gelu_new"` are supported. layer_norm_eps (`float`, *optional*, defaults
|
||||
to 1e-5): The epsilon used by the layer normalization layers.
|
||||
dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
initializer_factor (`float`, *optional*, defaults to 1):
|
||||
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
||||
testing).
|
||||
qkv_bias (`bool`, *optional*, defaults to `True`):
|
||||
Whether to add a bias to the queries and values in the self-attention layers.
|
||||
|
||||
@@ -91,18 +86,14 @@ class Blip2VisionConfig(PretrainedConfig):
|
||||
self,
|
||||
hidden_size=1408,
|
||||
intermediate_size=6144,
|
||||
projection_dim=512,
|
||||
num_hidden_layers=39,
|
||||
num_attention_heads=16,
|
||||
num_channels=3,
|
||||
image_size=224,
|
||||
patch_size=14,
|
||||
hidden_act="gelu",
|
||||
layer_norm_eps=0.00001,
|
||||
dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
initializer_range=1e-10,
|
||||
initializer_factor=1.0,
|
||||
qkv_bias=True,
|
||||
**kwargs,
|
||||
):
|
||||
@@ -110,15 +101,11 @@ class Blip2VisionConfig(PretrainedConfig):
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.projection_dim = projection_dim
|
||||
self.dropout = dropout
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_channels = num_channels
|
||||
self.patch_size = patch_size
|
||||
self.image_size = image_size
|
||||
self.initializer_range = initializer_range
|
||||
self.initializer_factor = initializer_factor
|
||||
self.attention_dropout = attention_dropout
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.hidden_act = hidden_act
|
||||
@@ -184,8 +171,6 @@ class Blip2QFormerConfig(PretrainedConfig):
|
||||
[Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
|
||||
For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
|
||||
with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
|
||||
classifier_dropout (`float`, *optional*):
|
||||
The dropout ratio for the classification head.
|
||||
cross_attention_frequency (`int`, *optional*, defaults to 2):
|
||||
The frequency of adding cross-attention to the Transformer layers.
|
||||
encoder_hidden_size (`int`, *optional*, defaults to 1408):
|
||||
@@ -221,7 +206,6 @@ class Blip2QFormerConfig(PretrainedConfig):
|
||||
layer_norm_eps=1e-12,
|
||||
pad_token_id=0,
|
||||
position_embedding_type="absolute",
|
||||
classifier_dropout=None,
|
||||
cross_attention_frequency=2,
|
||||
encoder_hidden_size=1408,
|
||||
**kwargs,
|
||||
@@ -240,7 +224,6 @@ class Blip2QFormerConfig(PretrainedConfig):
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.position_embedding_type = position_embedding_type
|
||||
self.classifier_dropout = classifier_dropout
|
||||
self.cross_attention_frequency = cross_attention_frequency
|
||||
self.encoder_hidden_size = encoder_hidden_size
|
||||
|
||||
|
||||
@@ -155,8 +155,6 @@ class BridgeTowerTextConfig(PretrainedConfig):
|
||||
initializer_factor (`float`, *optional*, defaults to 1):
|
||||
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
|
||||
testing).
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon used by the layer normalization layers.
|
||||
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
|
||||
@@ -170,8 +168,6 @@ class BridgeTowerTextConfig(PretrainedConfig):
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
classifier_dropout (`float`, *optional*):
|
||||
The dropout ratio for the classification head.
|
||||
|
||||
Example:
|
||||
|
||||
@@ -199,14 +195,12 @@ class BridgeTowerTextConfig(PretrainedConfig):
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=514,
|
||||
type_vocab_size=1,
|
||||
initializer_range=0.02,
|
||||
layer_norm_eps=1e-05,
|
||||
pad_token_id=1,
|
||||
bos_token_id=0,
|
||||
eos_token_id=2,
|
||||
position_embedding_type="absolute",
|
||||
use_cache=True,
|
||||
classifier_dropout=None,
|
||||
**kwargs,
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
@@ -222,11 +216,9 @@ class BridgeTowerTextConfig(PretrainedConfig):
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.type_vocab_size = type_vocab_size
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.position_embedding_type = position_embedding_type
|
||||
self.use_cache = use_cache
|
||||
self.classifier_dropout = classifier_dropout
|
||||
self.pad_token_id = pad_token_id
|
||||
self.bos_token_id = bos_token_id
|
||||
self.eos_token_id = eos_token_id
|
||||
|
||||
@@ -65,8 +65,6 @@ class ClapTextConfig(PretrainedConfig):
|
||||
just in case (e.g., 512 or 1024 or 2048).
|
||||
type_vocab_size (`int`, *optional*, defaults to 2):
|
||||
The vocabulary size of the `token_type_ids` passed when calling [`ClapTextModel`].
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
||||
The epsilon used by the layer normalization layers.
|
||||
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
|
||||
@@ -80,8 +78,6 @@ class ClapTextConfig(PretrainedConfig):
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models). Only
|
||||
relevant if `config.is_decoder=True`.
|
||||
classifier_dropout (`float`, *optional*):
|
||||
The dropout ratio for the classification head.
|
||||
projection_hidden_act (`str`, *optional*, defaults to `"relu"`):
|
||||
The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`,
|
||||
`"relu"`, `"silu"` and `"gelu_new"` are supported.
|
||||
@@ -116,7 +112,6 @@ class ClapTextConfig(PretrainedConfig):
|
||||
attention_probs_dropout_prob=0.1,
|
||||
max_position_embeddings=514,
|
||||
type_vocab_size=1,
|
||||
initializer_range=0.02,
|
||||
initializer_factor=1.0,
|
||||
layer_norm_eps=1e-12,
|
||||
projection_dim=512,
|
||||
@@ -125,7 +120,6 @@ class ClapTextConfig(PretrainedConfig):
|
||||
eos_token_id=2,
|
||||
position_embedding_type="absolute",
|
||||
use_cache=True,
|
||||
classifier_dropout=None,
|
||||
projection_hidden_act="relu",
|
||||
**kwargs,
|
||||
):
|
||||
@@ -141,12 +135,10 @@ class ClapTextConfig(PretrainedConfig):
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.max_position_embeddings = max_position_embeddings
|
||||
self.type_vocab_size = type_vocab_size
|
||||
self.initializer_range = initializer_range
|
||||
self.initializer_factor = initializer_factor
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.position_embedding_type = position_embedding_type
|
||||
self.use_cache = use_cache
|
||||
self.classifier_dropout = classifier_dropout
|
||||
self.projection_hidden_act = projection_hidden_act
|
||||
self.projection_dim = projection_dim
|
||||
|
||||
|
||||
@@ -187,16 +187,10 @@ class Pix2StructVisionConfig(PretrainedConfig):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
d_kv (`int`, *optional*, defaults to 64):
|
||||
Dimensionality of the key, query, value projections per attention head.
|
||||
projection_dim (`int`, *optional*, defaults to 768):
|
||||
Dimensionality of the projection layer in the Transformer encoder.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 12):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
num_channels (`int`, *optional*, defaults to 3):
|
||||
Number of channels of the input images.
|
||||
patch_size (`int`, *optional*, defaults to 16):
|
||||
The size (resolution) of each patch.
|
||||
dense_act_fn (`str` or `function`, *optional*, defaults to `"gelu_new"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||
`"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||
@@ -213,8 +207,6 @@ class Pix2StructVisionConfig(PretrainedConfig):
|
||||
testing).
|
||||
seq_len (`int`, *optional*, defaults to 4096):
|
||||
Maximum sequence length (here number of patches) supported by the model.
|
||||
layer_norm_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to add a bias to the layer normalization layers.
|
||||
relative_attention_num_buckets (`int`, *optional*, defaults to 32):
|
||||
The number of buckets to use for each attention layer.
|
||||
relative_attention_max_distance (`int`, *optional*, defaults to 128):
|
||||
@@ -243,11 +235,8 @@ class Pix2StructVisionConfig(PretrainedConfig):
|
||||
patch_embed_hidden_size=768,
|
||||
d_ff=2048,
|
||||
d_kv=64,
|
||||
projection_dim=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
num_channels=3,
|
||||
patch_size=16,
|
||||
dense_act_fn="gelu_new",
|
||||
layer_norm_eps=1e-6,
|
||||
dropout_rate=0.0,
|
||||
@@ -255,7 +244,6 @@ class Pix2StructVisionConfig(PretrainedConfig):
|
||||
initializer_range=1e-10,
|
||||
initializer_factor=1.0,
|
||||
seq_len=4096,
|
||||
layer_norm_bias=False,
|
||||
relative_attention_num_buckets=32,
|
||||
relative_attention_max_distance=128,
|
||||
**kwargs,
|
||||
@@ -265,19 +253,15 @@ class Pix2StructVisionConfig(PretrainedConfig):
|
||||
self.hidden_size = hidden_size
|
||||
self.patch_embed_hidden_size = patch_embed_hidden_size
|
||||
self.d_ff = d_ff
|
||||
self.projection_dim = projection_dim
|
||||
self.dropout_rate = dropout_rate
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.num_channels = num_channels
|
||||
self.patch_size = patch_size
|
||||
self.initializer_range = initializer_range
|
||||
self.initializer_factor = initializer_factor
|
||||
self.attention_dropout = attention_dropout
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.dense_act_fn = dense_act_fn
|
||||
self.seq_len = seq_len
|
||||
self.layer_norm_bias = layer_norm_bias
|
||||
self.relative_attention_num_buckets = relative_attention_num_buckets
|
||||
self.relative_attention_max_distance = relative_attention_max_distance
|
||||
self.d_kv = d_kv
|
||||
|
||||
@@ -150,10 +150,6 @@ class SamVisionConfig(PretrainedConfig):
|
||||
Args:
|
||||
hidden_size (`int`, *optional*, defaults to 768):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
intermediate_size (`int`, *optional*, defaults to 6144):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
projection_dim (`int`, *optional*, defaults to 512):
|
||||
Dimensionality of the projection layer in the Transformer encoder.
|
||||
output_channels (`int`, *optional*, defaults to 256):
|
||||
Dimensionality of the output channels in the Patch Encoder.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 12):
|
||||
@@ -170,14 +166,10 @@ class SamVisionConfig(PretrainedConfig):
|
||||
The non-linear activation function (function or string)
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-6):
|
||||
The epsilon used by the layer normalization layers.
|
||||
dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability.
|
||||
attention_dropout (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
initializer_range (`float`, *optional*, defaults to 1e-10):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
initializer_factor (`float`, *optional*, defaults to 1.0):
|
||||
A factor for multiplying the initializer range.
|
||||
qkv_bias (`bool`, *optional*, defaults to `True`):
|
||||
Whether to add a bias to query, key, value projections.
|
||||
mlp_ratio (`float`, *optional*, defaults to 4.0):
|
||||
@@ -200,8 +192,6 @@ class SamVisionConfig(PretrainedConfig):
|
||||
def __init__(
|
||||
self,
|
||||
hidden_size=768,
|
||||
intermediate_size=6144,
|
||||
projection_dim=512,
|
||||
output_channels=256,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
@@ -210,10 +200,8 @@ class SamVisionConfig(PretrainedConfig):
|
||||
patch_size=16,
|
||||
hidden_act="gelu",
|
||||
layer_norm_eps=1e-06,
|
||||
dropout=0.0,
|
||||
attention_dropout=0.0,
|
||||
initializer_range=1e-10,
|
||||
initializer_factor=1.0,
|
||||
qkv_bias=True,
|
||||
mlp_ratio=4.0,
|
||||
use_abs_pos=True,
|
||||
@@ -227,8 +215,6 @@ class SamVisionConfig(PretrainedConfig):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.hidden_size = hidden_size
|
||||
self.intermediate_size = intermediate_size
|
||||
self.projection_dim = projection_dim
|
||||
self.output_channels = output_channels
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
@@ -237,10 +223,8 @@ class SamVisionConfig(PretrainedConfig):
|
||||
self.patch_size = patch_size
|
||||
self.hidden_act = hidden_act
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
self.dropout = dropout
|
||||
self.attention_dropout = attention_dropout
|
||||
self.initializer_range = initializer_range
|
||||
self.initializer_factor = initializer_factor
|
||||
self.qkv_bias = qkv_bias
|
||||
self.mlp_ratio = mlp_ratio
|
||||
self.use_abs_pos = use_abs_pos
|
||||
|
||||
@@ -17,6 +17,7 @@ import inspect
|
||||
import os
|
||||
import re
|
||||
|
||||
from transformers.configuration_utils import PretrainedConfig
|
||||
from transformers.utils import direct_transformers_import
|
||||
|
||||
|
||||
@@ -77,6 +78,12 @@ SPECIAL_CASES_TO_ALLOW = {
|
||||
"TimeSeriesTransformerConfig": ["num_static_real_features", "num_time_features"],
|
||||
# used internally to calculate the feature size
|
||||
"AutoformerConfig": ["num_static_real_features", "num_time_features"],
|
||||
# used internally to calculate `mlp_dim`
|
||||
"SamVisionConfig": ["mlp_ratio"],
|
||||
# For (head) training, but so far not implemented
|
||||
"ClapAudioConfig": ["num_classes"],
|
||||
# Not used, but providing useful information to users
|
||||
"SpeechT5HifiGanConfig": ["sampling_rate"],
|
||||
}
|
||||
|
||||
|
||||
@@ -113,6 +120,10 @@ SPECIAL_CASES_TO_ALLOW.update(
|
||||
"VanConfig": True,
|
||||
"WavLMConfig": True,
|
||||
"WhisperConfig": True,
|
||||
# TODO: @Arthur (for `alignment_head` and `alignment_layer`)
|
||||
"JukeboxPriorConfig": True,
|
||||
# TODO: @Younes (for `is_decoder`)
|
||||
"Pix2StructTextConfig": True,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -254,7 +265,18 @@ def check_config_attributes_being_used(config_class):
|
||||
def check_config_attributes():
|
||||
"""Check the arguments in `__init__` of all configuration classes are used in python files"""
|
||||
configs_with_unused_attributes = {}
|
||||
for config_class in list(CONFIG_MAPPING.values()):
|
||||
for _config_class in list(CONFIG_MAPPING.values()):
|
||||
# Some config classes are not in `CONFIG_MAPPING` (e.g. `CLIPVisionConfig`, `Blip2VisionConfig`, etc.)
|
||||
config_classes_in_module = [
|
||||
cls
|
||||
for name, cls in inspect.getmembers(
|
||||
inspect.getmodule(_config_class),
|
||||
lambda x: inspect.isclass(x)
|
||||
and issubclass(x, PretrainedConfig)
|
||||
and inspect.getmodule(x) == inspect.getmodule(_config_class),
|
||||
)
|
||||
]
|
||||
for config_class in config_classes_in_module:
|
||||
unused_attributes = check_config_attributes_being_used(config_class)
|
||||
if len(unused_attributes) > 0:
|
||||
configs_with_unused_attributes[config_class.__name__] = unused_attributes
|
||||
|
||||
Reference in New Issue
Block a user