From eb734e51479be7d16d41d0f60c565cac3e57367a Mon Sep 17 00:00:00 2001
From: Gizem <gizemtabak7@gmail.com>
Date: Thu, 12 Oct 2023 07:51:34 -0700
Subject: [PATCH] [docstring] Fix `UniSpeech`, `UniSpeechSat`, `Wav2Vec2ForCTC`
 (#26664)

* Remove UniSpeechConfig

* Remove , at the end otherwise check_docstring changes order

* Auto add new docstring

* Update docstring for UniSpeechConfig

* Remove from check_docstrings

* Remove UniSpeechSatConfig and UniSpeechSatForCTC from check_docstrings

* Remove , at the end

* Fix docstring

* Update docstring for Wav2Vec2ForCTC

* Update Wav2Vec2ForCTC docstring

Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>

* fix style

---------

Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
---
 .../unispeech/configuration_unispeech.py      | 29 ++++++++++++-------
 .../models/unispeech/modeling_unispeech.py    |  6 ++++
 .../configuration_unispeech_sat.py            | 29 ++++++++++++-------
 .../unispeech_sat/modeling_unispeech_sat.py   |  6 ++++
 .../models/wav2vec2/modeling_wav2vec2.py      |  6 ++++
 utils/check_docstrings.py                     |  5 ----
 6 files changed, 54 insertions(+), 27 deletions(-)

diff --git a/src/transformers/models/unispeech/configuration_unispeech.py b/src/transformers/models/unispeech/configuration_unispeech.py
index eda06fa3d4..0cf270d1fa 100644
--- a/src/transformers/models/unispeech/configuration_unispeech.py
+++ b/src/transformers/models/unispeech/configuration_unispeech.py
@@ -65,6 +65,10 @@ class UniSpeechConfig(PretrainedConfig):
             The dropout ratio for activations inside the fully connected layer.
         attention_dropout (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for output of the feature encoder.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the output of the feature encoder that's used by the quantizer.
         final_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for the final projection layer of [`UniSpeechForCTC`].
         layerdrop (`float`, *optional*, defaults to 0.1):
@@ -72,26 +76,22 @@ class UniSpeechConfig(PretrainedConfig):
             details.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the layer normalization layers.
         feat_extract_norm (`str`, *optional*, defaults to `"group"`):
             The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
             normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
             convolutional layers.
-        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout probability for output of the feature encoder.
-        feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
+        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the 1D convolutional layers of the feature
             extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
-        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout probabilitiy for quantized feature encoder states.
         conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
         conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
             of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
-        conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+        conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 2, 2)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
             length of *conv_kernel* defines the number of convolutional layers and has to match the length of
             *conv_dim*.
@@ -118,7 +118,7 @@ class UniSpeechConfig(PretrainedConfig):
             actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
         mask_time_length (`int`, *optional*, defaults to 10):
             Length of vector span along the time axis.
-        mask_time_min_masks (`int`, *optional*, defaults to 2),:
+        mask_time_min_masks (`int`, *optional*, defaults to 2):
             The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step,
             irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
             mask_time_min_masks''
@@ -131,7 +131,7 @@ class UniSpeechConfig(PretrainedConfig):
             True`.
         mask_feature_length (`int`, *optional*, defaults to 10):
             Length of vector span along the feature axis.
-        mask_feature_min_masks (`int`, *optional*, defaults to 0),:
+        mask_feature_min_masks (`int`, *optional*, defaults to 0):
             The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
             step, irrespectively of `mask_feature_prob`. Only relevant if
             ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
@@ -141,8 +141,6 @@ class UniSpeechConfig(PretrainedConfig):
             Number of codevector groups for product codevector quantization.
         contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
             The temperature *kappa* in the contrastive loss.
-        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout probabilitiy for the output of the feature encoder that's used by the quantizer.
         num_negatives (`int`, *optional*, defaults to 100):
             Number of negative samples for the contrastive loss.
         codevector_dim (`int`, *optional*, defaults to 256):
@@ -163,6 +161,15 @@ class UniSpeechConfig(PretrainedConfig):
             instance of [`UniSpeechForSequenceClassification`].
         classifier_proj_size (`int`, *optional*, defaults to 256):
             Dimensionality of the projection before token mean-pooling for classification.
+        num_ctc_classes (`int`, *optional*, defaults to 80):
+            Specifies the number of classes (phoneme tokens and blank token) for phoneme-level CTC loss. Only relevant
+            when using an instance of [`UniSpeechForPreTraining`].
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
         replace_prob (`float`, *optional*, defaults to 0.5):
             Propability that transformer feature is replaced by quantized feature for pretraining.
 
diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py
index a72f56fd2f..c475ab7f80 100755
--- a/src/transformers/models/unispeech/modeling_unispeech.py
+++ b/src/transformers/models/unispeech/modeling_unispeech.py
@@ -1374,6 +1374,12 @@ class UniSpeechForPreTraining(UniSpeechPreTrainedModel):
 @add_start_docstrings(
     """UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
     UNISPEECH_START_DOCSTRING,
+    """
+        target_lang (`str`, *optional*):
+            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
+            adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechForCTC`] with adapters. Uses 'eng'
+            by default.
+    """,
 )
 # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->UniSpeech, wav2vec2->unispeech, WAV_2_VEC_2->UNISPEECH
 class UniSpeechForCTC(UniSpeechPreTrainedModel):
diff --git a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py
index a8ca718060..9d0a306115 100644
--- a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py
@@ -66,6 +66,10 @@ class UniSpeechSatConfig(PretrainedConfig):
             The dropout ratio for activations inside the fully connected layer.
         attention_dropout (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for output of the feature encoder.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the output of the feature encoder that's used by the quantizer.
         final_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for the final projection layer of [`UniSpeechSatForCTC`].
         layerdrop (`float`, *optional*, defaults to 0.1):
@@ -73,26 +77,22 @@ class UniSpeechSatConfig(PretrainedConfig):
             details.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the layer normalization layers.
         feat_extract_norm (`str`, *optional*, defaults to `"group"`):
             The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
             normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
             convolutional layers.
-        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout probability for output of the feature encoder.
-        feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
+        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the 1D convolutional layers of the feature
             extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
-        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout probabilitiy for quantized feature encoder states.
         conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
             feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
         conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
             of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
-        conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+        conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 2, 2)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
             length of *conv_kernel* defines the number of convolutional layers and has to match the length of
             *conv_dim*.
@@ -119,7 +119,7 @@ class UniSpeechSatConfig(PretrainedConfig):
             actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
         mask_time_length (`int`, *optional*, defaults to 10):
             Length of vector span along the time axis.
-        mask_time_min_masks (`int`, *optional*, defaults to 2),:
+        mask_time_min_masks (`int`, *optional*, defaults to 2):
             The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step,
             irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
             mask_time_min_masks''
@@ -132,7 +132,7 @@ class UniSpeechSatConfig(PretrainedConfig):
             True`.
         mask_feature_length (`int`, *optional*, defaults to 10):
             Length of vector span along the feature axis.
-        mask_feature_min_masks (`int`, *optional*, defaults to 0),:
+        mask_feature_min_masks (`int`, *optional*, defaults to 0):
             The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
             step, irrespectively of `mask_feature_prob`. Only relevant if
             ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
@@ -142,8 +142,6 @@ class UniSpeechSatConfig(PretrainedConfig):
             Number of codevector groups for product codevector quantization.
         contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
             The temperature *kappa* in the contrastive loss.
-        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout probabilitiy for the output of the feature encoder that's used by the quantizer.
         num_negatives (`int`, *optional*, defaults to 100):
             Number of negative samples for the contrastive loss.
         codevector_dim (`int`, *optional*, defaults to 256):
@@ -175,6 +173,15 @@ class UniSpeechSatConfig(PretrainedConfig):
             *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
         xvector_output_dim (`int`, *optional*, defaults to 512):
             Dimensionality of the *XVector* embedding vectors.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        num_clusters (`int`, *optional*, defaults to 504):
+            Number of clusters for weak labeling. Only relevant when using an instance of
+            [`UniSpeechSatForPreTraining`].
 
     Example:
 
diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
index 50b9093104..3fcc9549bb 100755
--- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
@@ -1381,6 +1381,12 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
 @add_start_docstrings(
     """UniSpeechSat Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
     UNISPEECH_SAT_START_DOCSTRING,
+    """
+        target_lang (`str`, *optional*):
+            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
+            adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechSatForCTC`] with adapters. Uses
+            'eng' by default.
+    """,
 )
 # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->UniSpeechSat, wav2vec2->unispeech_sat, WAV_2_VEC_2->UNISPEECH_SAT
 class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel):
diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
index a495cbcf45..a6e02a0476 100755
--- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -1871,6 +1871,12 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel):
 @add_start_docstrings(
     """Wav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
     WAV_2_VEC_2_START_DOCSTRING,
+    """
+        target_lang (`str`, *optional*):
+            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
+            adapter.<lang>.bin. Only relevant when using an instance of [`Wav2Vec2ForCTC`] with adapters. Uses 'eng' by
+            default.
+    """,
 )
 class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
     def __init__(self, config, target_lang: Optional[str] = None):
diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py
index ef4562aa9a..6ad68d3c67 100644
--- a/utils/check_docstrings.py
+++ b/utils/check_docstrings.py
@@ -760,10 +760,6 @@ OBJECTS_TO_IGNORE = [
     "TranslationPipeline",
     "TvltImageProcessor",
     "UMT5Config",
-    "UniSpeechConfig",
-    "UniSpeechForCTC",
-    "UniSpeechSatConfig",
-    "UniSpeechSatForCTC",
     "UperNetConfig",
     "UperNetForSemanticSegmentation",
     "ViTHybridImageProcessor",
@@ -787,7 +783,6 @@ OBJECTS_TO_IGNORE = [
     "Wav2Vec2ConformerConfig",
     "Wav2Vec2ConformerForCTC",
     "Wav2Vec2FeatureExtractor",
-    "Wav2Vec2ForCTC",
     "Wav2Vec2PhonemeCTCTokenizer",
     "WavLMConfig",
     "WavLMForCTC",