From eb734e51479be7d16d41d0f60c565cac3e57367a Mon Sep 17 00:00:00 2001 From: Gizem Date: Thu, 12 Oct 2023 07:51:34 -0700 Subject: [PATCH] [docstring] Fix `UniSpeech`, `UniSpeechSat`, `Wav2Vec2ForCTC` (#26664) * Remove UniSpeechConfig * Remove , at the end otherwise check_docstring changes order * Auto add new docstring * Update docstring for UniSpeechConfig * Remove from check_docstrings * Remove UniSpeechSatConfig and UniSpeechSatForCTC from check_docstrings * Remove , at the end * Fix docstring * Update docstring for Wav2Vec2ForCTC * Update Wav2Vec2ForCTC docstring Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> * fix style --------- Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> --- .../unispeech/configuration_unispeech.py | 29 ++++++++++++------- .../models/unispeech/modeling_unispeech.py | 6 ++++ .../configuration_unispeech_sat.py | 29 ++++++++++++------- .../unispeech_sat/modeling_unispeech_sat.py | 6 ++++ .../models/wav2vec2/modeling_wav2vec2.py | 6 ++++ utils/check_docstrings.py | 5 ---- 6 files changed, 54 insertions(+), 27 deletions(-) diff --git a/src/transformers/models/unispeech/configuration_unispeech.py b/src/transformers/models/unispeech/configuration_unispeech.py index eda06fa3d4..0cf270d1fa 100644 --- a/src/transformers/models/unispeech/configuration_unispeech.py +++ b/src/transformers/models/unispeech/configuration_unispeech.py @@ -65,6 +65,10 @@ class UniSpeechConfig(PretrainedConfig): The dropout ratio for activations inside the fully connected layer. attention_dropout (`float`, *optional*, defaults to 0.1): The dropout ratio for the attention probabilities. + feat_proj_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for output of the feature encoder. + feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for the output of the feature encoder that's used by the quantizer.
final_dropout (`float`, *optional*, defaults to 0.1): The dropout probability for the final projection layer of [`UniSpeechForCTC`]. layerdrop (`float`, *optional*, defaults to 0.1): @@ -72,26 +76,22 @@ class UniSpeechConfig(PretrainedConfig): details. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. feat_extract_norm (`str`, *optional*, defaults to `"group"`): The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D convolutional layers. - feat_proj_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability for output of the feature encoder. - feat_extract_activation (`str, `optional`, defaults to `"gelu"`): + feat_extract_activation (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the 1D convolutional layers of the feature extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for quantized feature encoder states. conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder.
The length of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. - conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): + conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 2, 2)`): A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The length of *conv_kernel* defines the number of convolutional layers and has to match the length of *conv_dim*. @@ -118,7 +118,7 @@ class UniSpeechConfig(PretrainedConfig): actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`. mask_time_length (`int`, *optional*, defaults to 10): Length of vector span along the time axis. - mask_time_min_masks (`int`, *optional*, defaults to 2),: + mask_time_min_masks (`int`, *optional*, defaults to 2): The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step, irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks'' @@ -131,7 +131,7 @@ class UniSpeechConfig(PretrainedConfig): True`. mask_feature_length (`int`, *optional*, defaults to 10): Length of vector span along the feature axis. - mask_feature_min_masks (`int`, *optional*, defaults to 0),: + mask_feature_min_masks (`int`, *optional*, defaults to 0): The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time step, irrespectively of `mask_feature_prob`. Only relevant if ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' @@ -141,8 +141,6 @@ class UniSpeechConfig(PretrainedConfig): Number of codevector groups for product codevector quantization. contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): The temperature *kappa* in the contrastive loss. 
- feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for the output of the feature encoder that's used by the quantizer. num_negatives (`int`, *optional*, defaults to 100): Number of negative samples for the contrastive loss. codevector_dim (`int`, *optional*, defaults to 256): @@ -163,6 +161,15 @@ class UniSpeechConfig(PretrainedConfig): instance of [`UniSpeechForSequenceClassification`]. classifier_proj_size (`int`, *optional*, defaults to 256): Dimensionality of the projection before token mean-pooling for classification. + num_ctc_classes (`int`, *optional*, defaults to 80): + Specifies the number of classes (phoneme tokens and blank token) for phoneme-level CTC loss. Only relevant + when using an instance of [`UniSpeechForPreTraining`]. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. replace_prob (`float`, *optional*, defaults to 0.5): Propability that transformer feature is replaced by quantized feature for pretraining. diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index a72f56fd2f..c475ab7f80 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -1374,6 +1374,12 @@ class UniSpeechForPreTraining(UniSpeechPreTrainedModel): @add_start_docstrings( """UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", UNISPEECH_START_DOCSTRING, + """ + target_lang (`str`, *optional*): + Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or + adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechForCTC`] with adapters. Uses 'eng' + by default.
+ """, ) # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->UniSpeech, wav2vec2->unispeech, WAV_2_VEC_2->UNISPEECH class UniSpeechForCTC(UniSpeechPreTrainedModel): diff --git a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py index a8ca718060..9d0a306115 100644 --- a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py @@ -66,6 +66,10 @@ class UniSpeechSatConfig(PretrainedConfig): The dropout ratio for activations inside the fully connected layer. attention_dropout (`float`, *optional*, defaults to 0.1): The dropout ratio for the attention probabilities. + feat_proj_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for output of the feature encoder. + feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for the output of the feature encoder that's used by the quantizer. final_dropout (`float`, *optional*, defaults to 0.1): The dropout probability for the final projection layer of [`UniSpeechSatForCTC`]. layerdrop (`float`, *optional*, defaults to 0.1): @@ -73,26 +77,22 @@ class UniSpeechSatConfig(PretrainedConfig): details. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - layer_norm_eps (`float`, *optional*, defaults to 1e-12): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. feat_extract_norm (`str`, *optional*, defaults to `"group"`): The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D convolutional layers.
- feat_proj_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability for output of the feature encoder. - feat_extract_activation (`str, `optional`, defaults to `"gelu"`): + feat_extract_activation (`str`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the 1D convolutional layers of the feature extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for quantized feature encoder states. conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. - conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): + conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 2, 2)`): A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The length of *conv_kernel* defines the number of convolutional layers and has to match the length of *conv_dim*. @@ -119,7 +119,7 @@ class UniSpeechSatConfig(PretrainedConfig): actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`. mask_time_length (`int`, *optional*, defaults to 10): Length of vector span along the time axis.
- mask_time_min_masks (`int`, *optional*, defaults to 2),: + mask_time_min_masks (`int`, *optional*, defaults to 2): The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step, irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks'' @@ -132,7 +132,7 @@ class UniSpeechSatConfig(PretrainedConfig): True`. mask_feature_length (`int`, *optional*, defaults to 10): Length of vector span along the feature axis. - mask_feature_min_masks (`int`, *optional*, defaults to 0),: + mask_feature_min_masks (`int`, *optional*, defaults to 0): The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time step, irrespectively of `mask_feature_prob`. Only relevant if ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' @@ -142,8 +142,6 @@ class UniSpeechSatConfig(PretrainedConfig): Number of codevector groups for product codevector quantization. contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): The temperature *kappa* in the contrastive loss. - feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for the output of the feature encoder that's used by the quantizer. num_negatives (`int`, *optional*, defaults to 100): Number of negative samples for the contrastive loss. codevector_dim (`int`, *optional*, defaults to 256): @@ -175,6 +173,15 @@ class UniSpeechSatConfig(PretrainedConfig): *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*. xvector_output_dim (`int`, *optional*, defaults to 512): Dimensionality of the *XVector* embedding vectors. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. 
+ num_clusters (`int`, *optional*, defaults to 504): + Number of clusters for weak labeling. Only relevant when using an instance of + [`UniSpeechSatForPreTraining`]. Example: diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 50b9093104..3fcc9549bb 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -1381,6 +1381,12 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel): @add_start_docstrings( """UniSpeechSat Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", UNISPEECH_SAT_START_DOCSTRING, + """ + target_lang (`str`, *optional*): + Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or + adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechSatForCTC`] with adapters. Uses + 'eng' by default. + """, ) # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->UniSpeechSat, wav2vec2->unispeech_sat, WAV_2_VEC_2->UNISPEECH_SAT class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel): diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index a495cbcf45..a6e02a0476 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -1871,6 +1871,12 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel): @add_start_docstrings( """Wav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""", WAV_2_VEC_2_START_DOCSTRING, + """ + target_lang (`str`, *optional*): + Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or + adapter.<lang>.bin. Only relevant when using an instance of [`Wav2Vec2ForCTC`] with adapters. Uses 'eng' by + default.
+ """, ) class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel): def __init__(self, config, target_lang: Optional[str] = None): diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index ef4562aa9a..6ad68d3c67 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -760,10 +760,6 @@ OBJECTS_TO_IGNORE = [ "TranslationPipeline", "TvltImageProcessor", "UMT5Config", - "UniSpeechConfig", - "UniSpeechForCTC", - "UniSpeechSatConfig", - "UniSpeechSatForCTC", "UperNetConfig", "UperNetForSemanticSegmentation", "ViTHybridImageProcessor", @@ -787,7 +783,6 @@ OBJECTS_TO_IGNORE = [ "Wav2Vec2ConformerConfig", "Wav2Vec2ConformerForCTC", "Wav2Vec2FeatureExtractor", - "Wav2Vec2ForCTC", "Wav2Vec2PhonemeCTCTokenizer", "WavLMConfig", "WavLMForCTC",