From 7504be35ab88639ec3def32f0392f0a43c9551ee Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Thu, 15 Jun 2023 11:39:20 +0200 Subject: [PATCH] Fix `check_config_attributes`: check all configuration classes (#24231) * fix --------- Co-authored-by: ydshieh --- .../models/align/configuration_align.py | 4 --- .../models/blip/configuration_blip.py | 2 -- .../models/blip_2/configuration_blip_2.py | 17 ----------- .../bridgetower/configuration_bridgetower.py | 8 ----- .../models/clap/configuration_clap.py | 8 ----- .../pix2struct/configuration_pix2struct.py | 16 ---------- .../models/sam/configuration_sam.py | 16 ---------- utils/check_config_attributes.py | 30 ++++++++++++++++--- 8 files changed, 26 insertions(+), 75 deletions(-) diff --git a/src/transformers/models/align/configuration_align.py b/src/transformers/models/align/configuration_align.py index 0436b278f0..cb7baddfe1 100644 --- a/src/transformers/models/align/configuration_align.py +++ b/src/transformers/models/align/configuration_align.py @@ -206,8 +206,6 @@ class AlignVisionConfig(PretrainedConfig): The epsilon used by the batch normalization layers. batch_norm_momentum (`float`, *optional*, defaults to 0.99): The momentum used by the batch normalization layers. - dropout_rate (`float`, *optional*, defaults to 0.5): - The dropout rate to be applied before final classifier layer. drop_connect_rate (`float`, *optional*, defaults to 0.2): The drop rate for skip connections. @@ -249,7 +247,6 @@ class AlignVisionConfig(PretrainedConfig): initializer_range: float = 0.02, batch_norm_eps: float = 0.001, batch_norm_momentum: float = 0.99, - dropout_rate: float = 0.5, drop_connect_rate: float = 0.2, **kwargs, ): @@ -274,7 +271,6 @@ class AlignVisionConfig(PretrainedConfig): self.initializer_range = initializer_range self.batch_norm_eps = batch_norm_eps self.batch_norm_momentum = batch_norm_momentum - self.dropout_rate = dropout_rate self.drop_connect_rate = drop_connect_rate self.num_hidden_layers = sum(num_block_repeats) * 4 diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py index f03f167c29..53a7785282 100644 --- a/src/transformers/models/blip/configuration_blip.py +++ b/src/transformers/models/blip/configuration_blip.py @@ -234,7 +234,6 @@ class BlipVisionConfig(PretrainedConfig): projection_dim=512, num_hidden_layers=12, num_attention_heads=12, - num_channels=3, image_size=384, patch_size=16, hidden_act="gelu", @@ -250,7 +249,6 @@ class BlipVisionConfig(PretrainedConfig): self.projection_dim = projection_dim self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.num_channels = num_channels self.patch_size = patch_size self.image_size = image_size self.initializer_range = initializer_range diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py index 8a80510db9..6adf85e611 100644 --- a/src/transformers/models/blip_2/configuration_blip_2.py +++ b/src/transformers/models/blip_2/configuration_blip_2.py @@ -58,15 +58,10 @@ class Blip2VisionConfig(PretrainedConfig): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon used by the layer normalization layers. - dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float``, *optional*, defaults to 1): - A factor for initializing all weight matrices (should be kept to 1, used internally for initialization - testing). qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to the queries and values in the self-attention layers. @@ -91,18 +86,14 @@ class Blip2VisionConfig(PretrainedConfig): self, hidden_size=1408, intermediate_size=6144, - projection_dim=512, num_hidden_layers=39, num_attention_heads=16, - num_channels=3, image_size=224, patch_size=14, hidden_act="gelu", layer_norm_eps=0.00001, - dropout=0.0, attention_dropout=0.0, initializer_range=1e-10, - initializer_factor=1.0, qkv_bias=True, **kwargs, ): @@ -110,15 +101,11 @@ class Blip2VisionConfig(PretrainedConfig): self.hidden_size = hidden_size self.intermediate_size = intermediate_size - self.projection_dim = projection_dim - self.dropout = dropout self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.num_channels = num_channels self.patch_size = patch_size self.image_size = image_size self.initializer_range = initializer_range - self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act @@ -184,8 +171,6 @@ class Blip2QFormerConfig(PretrainedConfig): [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). - classifier_dropout (`float`, *optional*): - The dropout ratio for the classification head. cross_attention_frequency (`int`, *optional*, defaults to 2): The frequency of adding cross-attention to the Transformer layers. encoder_hidden_size (`int`, *optional*, defaults to 1408): @@ -221,7 +206,6 @@ class Blip2QFormerConfig(PretrainedConfig): layer_norm_eps=1e-12, pad_token_id=0, position_embedding_type="absolute", - classifier_dropout=None, cross_attention_frequency=2, encoder_hidden_size=1408, **kwargs, @@ -240,7 +224,6 @@ class Blip2QFormerConfig(PretrainedConfig): self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.position_embedding_type = position_embedding_type - self.classifier_dropout = classifier_dropout self.cross_attention_frequency = cross_attention_frequency self.encoder_hidden_size = encoder_hidden_size diff --git a/src/transformers/models/bridgetower/configuration_bridgetower.py b/src/transformers/models/bridgetower/configuration_bridgetower.py index 3149b34efa..17c9cadaf8 100644 --- a/src/transformers/models/bridgetower/configuration_bridgetower.py +++ b/src/transformers/models/bridgetower/configuration_bridgetower.py @@ -155,8 +155,6 @@ class BridgeTowerTextConfig(PretrainedConfig): initializer_factor (`float``, *optional*, defaults to 1): A factor for initializing all weight matrices (should be kept to 1, used internally for initialization testing). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. position_embedding_type (`str`, *optional*, defaults to `"absolute"`): @@ -170,8 +168,6 @@ class BridgeTowerTextConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. - classifier_dropout (`float`, *optional*): - The dropout ratio for the classification head. Example: @@ -199,14 +195,12 @@ class BridgeTowerTextConfig(PretrainedConfig): attention_probs_dropout_prob=0.1, max_position_embeddings=514, type_vocab_size=1, - initializer_range=0.02, layer_norm_eps=1e-05, pad_token_id=1, bos_token_id=0, eos_token_id=2, position_embedding_type="absolute", use_cache=True, - classifier_dropout=None, **kwargs, ): super().__init__(**kwargs) @@ -222,11 +216,9 @@ class BridgeTowerTextConfig(PretrainedConfig): self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.position_embedding_type = position_embedding_type self.use_cache = use_cache - self.classifier_dropout = classifier_dropout self.pad_token_id = pad_token_id self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 13d1f7b7e0..9886c614da 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -65,8 +65,6 @@ class ClapTextConfig(PretrainedConfig): just in case (e.g., 512 or 1024 or 2048). type_vocab_size (`int`, *optional*, defaults to 2): The vocabulary size of the `token_type_ids` passed when calling [`ClapTextModel`]. - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. position_embedding_type (`str`, *optional*, defaults to `"absolute"`): @@ -80,8 +78,6 @@ class ClapTextConfig(PretrainedConfig): use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). Only relevant if `config.is_decoder=True`. - classifier_dropout (`float`, *optional*): - The dropout ratio for the classification head. projection_hidden_act (`str`, *optional*, defaults to `"relu"`): The non-linear activation function (function or string) in the projection layer. If string, `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported. @@ -116,7 +112,6 @@ class ClapTextConfig(PretrainedConfig): attention_probs_dropout_prob=0.1, max_position_embeddings=514, type_vocab_size=1, - initializer_range=0.02, initializer_factor=1.0, layer_norm_eps=1e-12, projection_dim=512, @@ -125,7 +120,6 @@ class ClapTextConfig(PretrainedConfig): eos_token_id=2, position_embedding_type="absolute", use_cache=True, - classifier_dropout=None, projection_hidden_act="relu", **kwargs, ): @@ -141,12 +135,10 @@ class ClapTextConfig(PretrainedConfig): self.attention_probs_dropout_prob = attention_probs_dropout_prob self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range self.initializer_factor = initializer_factor self.layer_norm_eps = layer_norm_eps self.position_embedding_type = position_embedding_type self.use_cache = use_cache - self.classifier_dropout = classifier_dropout self.projection_hidden_act = projection_hidden_act self.projection_dim = projection_dim diff --git a/src/transformers/models/pix2struct/configuration_pix2struct.py b/src/transformers/models/pix2struct/configuration_pix2struct.py index 32aa34941f..cf1dd91b46 100644 --- a/src/transformers/models/pix2struct/configuration_pix2struct.py +++ b/src/transformers/models/pix2struct/configuration_pix2struct.py @@ -187,16 +187,10 @@ class Pix2StructVisionConfig(PretrainedConfig): Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. d_kv (`int`, *optional*, defaults to 64): Dimensionality of the key, query, value projections per attention head. - projection_dim (`int`, *optional*, defaults to 768): - Dimensionality of the projection layer in the Transformer encoder. num_hidden_layers (`int`, *optional*, defaults to 12): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. - num_channels (`int`, *optional*, defaults to 3): - Number of channels of the input images. - patch_size (`int`, *optional*, defaults to 16): - The size (resolution) of each patch. dense_act_fn (`str` or `function`, *optional*, defaults to `"gelu_new"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. @@ -213,8 +207,6 @@ class Pix2StructVisionConfig(PretrainedConfig): testing). seq_len (`int`, *optional*, defaults to 4096): Maximum sequence length (here number of patches) supported by the model. - layer_norm_bias (`bool`, *optional*, defaults to `False`): - Whether or not to add a bias to the layer normalization layers. relative_attention_num_buckets (`int`, *optional*, defaults to 32): The number of buckets to use for each attention layer. relative_attention_max_distance (`int`, *optional*, defaults to 128): @@ -243,11 +235,8 @@ class Pix2StructVisionConfig(PretrainedConfig): patch_embed_hidden_size=768, d_ff=2048, d_kv=64, - projection_dim=768, num_hidden_layers=12, num_attention_heads=12, - num_channels=3, - patch_size=16, dense_act_fn="gelu_new", layer_norm_eps=1e-6, dropout_rate=0.0, @@ -255,7 +244,6 @@ class Pix2StructVisionConfig(PretrainedConfig): initializer_range=1e-10, initializer_factor=1.0, seq_len=4096, - layer_norm_bias=False, relative_attention_num_buckets=32, relative_attention_max_distance=128, **kwargs, @@ -265,19 +253,15 @@ class Pix2StructVisionConfig(PretrainedConfig): self.hidden_size = hidden_size self.patch_embed_hidden_size = patch_embed_hidden_size self.d_ff = d_ff - self.projection_dim = projection_dim self.dropout_rate = dropout_rate self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size self.initializer_range = initializer_range self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout self.layer_norm_eps = layer_norm_eps self.dense_act_fn = dense_act_fn self.seq_len = seq_len - self.layer_norm_bias = layer_norm_bias self.relative_attention_num_buckets = relative_attention_num_buckets self.relative_attention_max_distance = relative_attention_max_distance self.d_kv = d_kv diff --git a/src/transformers/models/sam/configuration_sam.py b/src/transformers/models/sam/configuration_sam.py index fc3701fca7..87427d4573 100644 --- a/src/transformers/models/sam/configuration_sam.py +++ b/src/transformers/models/sam/configuration_sam.py @@ -150,10 +150,6 @@ class SamVisionConfig(PretrainedConfig): Args: hidden_size (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. - intermediate_size (`int`, *optional*, defaults to 6144): - Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. - projection_dim (`int`, *optional*, defaults to 512): - Dimensionality of the projection layer in the Transformer encoder. output_channels (`int`, *optional*, defaults to 256): Dimensionality of the output channels in the Patch Encoder. num_hidden_layers (`int`, *optional*, defaults to 12): @@ -170,14 +166,10 @@ class SamVisionConfig(PretrainedConfig): The non-linear activation function (function or string) layer_norm_eps (`float`, *optional*, defaults to 1e-6): The epsilon used by the layer normalization layers. - dropout (`float`, *optional*, defaults to 0.0): - The dropout probability. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 1e-10): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float`, *optional*, defaults to 1.0): - A factor for multiplying the initializer range. qkv_bias (`bool`, *optional*, defaults to `True`): Whether to add a bias to query, key, value projections. mlp_ratio (`float`, *optional*, defaults to 4.0): @@ -200,8 +192,6 @@ class SamVisionConfig(PretrainedConfig): def __init__( self, hidden_size=768, - intermediate_size=6144, - projection_dim=512, output_channels=256, num_hidden_layers=12, num_attention_heads=12, @@ -210,10 +200,8 @@ class SamVisionConfig(PretrainedConfig): patch_size=16, hidden_act="gelu", layer_norm_eps=1e-06, - dropout=0.0, attention_dropout=0.0, initializer_range=1e-10, - initializer_factor=1.0, qkv_bias=True, mlp_ratio=4.0, use_abs_pos=True, @@ -227,8 +215,6 @@ class SamVisionConfig(PretrainedConfig): super().__init__(**kwargs) self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.projection_dim = projection_dim self.output_channels = output_channels self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads @@ -237,10 +223,8 @@ class SamVisionConfig(PretrainedConfig): self.patch_size = patch_size self.hidden_act = hidden_act self.layer_norm_eps = layer_norm_eps - self.dropout = dropout self.attention_dropout = attention_dropout self.initializer_range = initializer_range - self.initializer_factor = initializer_factor self.qkv_bias = qkv_bias self.mlp_ratio = mlp_ratio self.use_abs_pos = use_abs_pos diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index 63b1bacbbe..dad6888c80 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -17,6 +17,7 @@ import inspect import os import re +from transformers.configuration_utils import PretrainedConfig from transformers.utils import direct_transformers_import @@ -77,6 +78,12 @@ SPECIAL_CASES_TO_ALLOW = { "TimeSeriesTransformerConfig": ["num_static_real_features", "num_time_features"], # used internally to calculate the feature size "AutoformerConfig": ["num_static_real_features", "num_time_features"], + # used internally to calculate `mlp_dim` + "SamVisionConfig": ["mlp_ratio"], + # For (head) training, but so far not implemented + "ClapAudioConfig": ["num_classes"], + # Not used, but providing useful information to users + "SpeechT5HifiGanConfig": ["sampling_rate"], } @@ -113,6 +120,10 @@ SPECIAL_CASES_TO_ALLOW.update( "VanConfig": True, "WavLMConfig": True, "WhisperConfig": True, + # TODO: @Arthur (for `alignment_head` and `alignment_layer`) + "JukeboxPriorConfig": True, + # TODO: @Younes (for `is_decoder`) + "Pix2StructTextConfig": True, } ) @@ -254,10 +265,21 @@ def check_config_attributes_being_used(config_class): def check_config_attributes(): """Check the arguments in `__init__` of all configuration classes are used in python files""" configs_with_unused_attributes = {} - for config_class in list(CONFIG_MAPPING.values()): - unused_attributes = check_config_attributes_being_used(config_class) - if len(unused_attributes) > 0: - configs_with_unused_attributes[config_class.__name__] = unused_attributes + for _config_class in list(CONFIG_MAPPING.values()): + # Some config classes are not in `CONFIG_MAPPING` (e.g. `CLIPVisionConfig`, `Blip2VisionConfig`, etc.) + config_classes_in_module = [ + cls + for name, cls in inspect.getmembers( + inspect.getmodule(_config_class), + lambda x: inspect.isclass(x) + and issubclass(x, PretrainedConfig) + and inspect.getmodule(x) == inspect.getmodule(_config_class), + ) + ] + for config_class in config_classes_in_module: + unused_attributes = check_config_attributes_being_used(config_class) + if len(unused_attributes) > 0: + configs_with_unused_attributes[config_class.__name__] = unused_attributes if len(configs_with_unused_attributes) > 0: error = "The following configuration classes contain unused attributes in the corresponding modeling files:\n"