[docstring] Fix UniSpeech, UniSpeechSat, Wav2Vec2ForCTC (#26664)
* Remove UniSpeechConfig * Remove , at the end otherwise check_docstring changes order * Auto add new docstring * Update docstring for UniSpeechConfig * Remove from check_docstrings * Remove UniSpeechSatConfig and UniSpeechSatForCTC from check_docstrings * Remove , at the end * Fix docstring * Update docstring for Wav2Vec2ForCTC * Update Wav2Vec2ForCTC docstring Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> * fix style --------- Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -65,6 +65,10 @@ class UniSpeechConfig(PretrainedConfig):
|
|||||||
The dropout ratio for activations inside the fully connected layer.
|
The dropout ratio for activations inside the fully connected layer.
|
||||||
attention_dropout (`float`, *optional*, defaults to 0.1):
|
attention_dropout (`float`, *optional*, defaults to 0.1):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
|
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
|
The dropout probability for output of the feature encoder.
|
||||||
|
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
|
The dropout probability for the output of the feature encoder that's used by the quantizer.
|
||||||
final_dropout (`float`, *optional*, defaults to 0.1):
|
final_dropout (`float`, *optional*, defaults to 0.1):
|
||||||
The dropout probability for the final projection layer of [`UniSpeechForCTC`].
|
The dropout probability for the final projection layer of [`UniSpeechForCTC`].
|
||||||
layerdrop (`float`, *optional*, defaults to 0.1):
|
layerdrop (`float`, *optional*, defaults to 0.1):
|
||||||
@@ -72,26 +76,22 @@ class UniSpeechConfig(PretrainedConfig):
|
|||||||
details.
|
details.
|
||||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
||||||
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
|
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
|
||||||
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
||||||
convolutional layers.
|
convolutional layers.
|
||||||
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
||||||
The dropout probability for output of the feature encoder.
|
|
||||||
feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
|
|
||||||
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
||||||
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||||
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
|
||||||
The dropout probabilitiy for quantized feature encoder states.
|
|
||||||
conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
|
conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
|
||||||
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
||||||
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
|
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
|
||||||
conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
|
conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
|
||||||
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
|
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
|
||||||
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
||||||
conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
|
conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 2, 2)`):
|
||||||
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
|
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
|
||||||
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
||||||
*conv_dim*.
|
*conv_dim*.
|
||||||
@@ -118,7 +118,7 @@ class UniSpeechConfig(PretrainedConfig):
|
|||||||
actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
|
actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
|
||||||
mask_time_length (`int`, *optional*, defaults to 10):
|
mask_time_length (`int`, *optional*, defaults to 10):
|
||||||
Length of vector span along the time axis.
|
Length of vector span along the time axis.
|
||||||
mask_time_min_masks (`int`, *optional*, defaults to 2),:
|
mask_time_min_masks (`int`, *optional*, defaults to 2):
|
||||||
The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
|
The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
|
||||||
irrespectively of `mask_time_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
|
irrespectively of `mask_time_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
|
||||||
mask_time_min_masks''
|
mask_time_min_masks''
|
||||||
@@ -131,7 +131,7 @@ class UniSpeechConfig(PretrainedConfig):
|
|||||||
True`.
|
True`.
|
||||||
mask_feature_length (`int`, *optional*, defaults to 10):
|
mask_feature_length (`int`, *optional*, defaults to 10):
|
||||||
Length of vector span along the feature axis.
|
Length of vector span along the feature axis.
|
||||||
mask_feature_min_masks (`int`, *optional*, defaults to 0),:
|
mask_feature_min_masks (`int`, *optional*, defaults to 0):
|
||||||
The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
|
The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
|
||||||
step, irrespectively of `mask_feature_prob`. Only relevant if
|
step, irrespectively of `mask_feature_prob`. Only relevant if
|
||||||
''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
|
''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
|
||||||
@@ -141,8 +141,6 @@ class UniSpeechConfig(PretrainedConfig):
|
|||||||
Number of codevector groups for product codevector quantization.
|
Number of codevector groups for product codevector quantization.
|
||||||
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
|
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
|
||||||
The temperature *kappa* in the contrastive loss.
|
The temperature *kappa* in the contrastive loss.
|
||||||
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
|
||||||
The dropout probabilitiy for the output of the feature encoder that's used by the quantizer.
|
|
||||||
num_negatives (`int`, *optional*, defaults to 100):
|
num_negatives (`int`, *optional*, defaults to 100):
|
||||||
Number of negative samples for the contrastive loss.
|
Number of negative samples for the contrastive loss.
|
||||||
codevector_dim (`int`, *optional*, defaults to 256):
|
codevector_dim (`int`, *optional*, defaults to 256):
|
||||||
@@ -163,6 +161,15 @@ class UniSpeechConfig(PretrainedConfig):
|
|||||||
instance of [`UniSpeechForSequenceClassification`].
|
instance of [`UniSpeechForSequenceClassification`].
|
||||||
classifier_proj_size (`int`, *optional*, defaults to 256):
|
classifier_proj_size (`int`, *optional*, defaults to 256):
|
||||||
Dimensionality of the projection before token mean-pooling for classification.
|
Dimensionality of the projection before token mean-pooling for classification.
|
||||||
|
num_ctc_classes (`int`, *optional*, defaults to 80):
|
||||||
|
Specifies the number of classes (phoneme tokens and blank token) for phoneme-level CTC loss. Only relevant
|
||||||
|
when using an instance of [`UniSpeechForPreTraining`].
|
||||||
|
pad_token_id (`int`, *optional*, defaults to 0):
|
||||||
|
The id of the padding token.
|
||||||
|
bos_token_id (`int`, *optional*, defaults to 1):
|
||||||
|
The id of the "beginning-of-sequence" token.
|
||||||
|
eos_token_id (`int`, *optional*, defaults to 2):
|
||||||
|
The id of the "end-of-sequence" token.
|
||||||
replace_prob (`float`, *optional*, defaults to 0.5):
|
replace_prob (`float`, *optional*, defaults to 0.5):
|
||||||
Probability that transformer feature is replaced by quantized feature for pretraining.
|
Probability that transformer feature is replaced by quantized feature for pretraining.
|
||||||
|
|
||||||
|
|||||||
@@ -1374,6 +1374,12 @@ class UniSpeechForPreTraining(UniSpeechPreTrainedModel):
|
|||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
"""UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
|
"""UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
|
||||||
UNISPEECH_START_DOCSTRING,
|
UNISPEECH_START_DOCSTRING,
|
||||||
|
"""
|
||||||
|
target_lang (`str`, *optional*):
|
||||||
|
Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
|
||||||
|
adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechForCTC`] with adapters. Uses 'eng'
|
||||||
|
by default.
|
||||||
|
""",
|
||||||
)
|
)
|
||||||
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->UniSpeech, wav2vec2->unispeech, WAV_2_VEC_2->UNISPEECH
|
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->UniSpeech, wav2vec2->unispeech, WAV_2_VEC_2->UNISPEECH
|
||||||
class UniSpeechForCTC(UniSpeechPreTrainedModel):
|
class UniSpeechForCTC(UniSpeechPreTrainedModel):
|
||||||
|
|||||||
@@ -66,6 +66,10 @@ class UniSpeechSatConfig(PretrainedConfig):
|
|||||||
The dropout ratio for activations inside the fully connected layer.
|
The dropout ratio for activations inside the fully connected layer.
|
||||||
attention_dropout (`float`, *optional*, defaults to 0.1):
|
attention_dropout (`float`, *optional*, defaults to 0.1):
|
||||||
The dropout ratio for the attention probabilities.
|
The dropout ratio for the attention probabilities.
|
||||||
|
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
|
The dropout probability for output of the feature encoder.
|
||||||
|
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
|
The dropout probability for the output of the feature encoder that's used by the quantizer.
|
||||||
final_dropout (`float`, *optional*, defaults to 0.1):
|
final_dropout (`float`, *optional*, defaults to 0.1):
|
||||||
The dropout probability for the final projection layer of [`UniSpeechSatForCTC`].
|
The dropout probability for the final projection layer of [`UniSpeechSatForCTC`].
|
||||||
layerdrop (`float`, *optional*, defaults to 0.1):
|
layerdrop (`float`, *optional*, defaults to 0.1):
|
||||||
@@ -73,26 +77,22 @@ class UniSpeechSatConfig(PretrainedConfig):
|
|||||||
details.
|
details.
|
||||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
||||||
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
|
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
|
||||||
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
||||||
convolutional layers.
|
convolutional layers.
|
||||||
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
||||||
The dropout probability for output of the feature encoder.
|
|
||||||
feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
|
|
||||||
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
||||||
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||||
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
|
||||||
The dropout probabilitiy for quantized feature encoder states.
|
|
||||||
conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
|
conv_dim (`Tuple[int]` or `List[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
|
||||||
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
||||||
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
|
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
|
||||||
conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
|
conv_stride (`Tuple[int]` or `List[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
|
||||||
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
|
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
|
||||||
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
||||||
conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
|
conv_kernel (`Tuple[int]` or `List[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 2, 2)`):
|
||||||
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
|
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
|
||||||
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
||||||
*conv_dim*.
|
*conv_dim*.
|
||||||
@@ -119,7 +119,7 @@ class UniSpeechSatConfig(PretrainedConfig):
|
|||||||
actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
|
actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
|
||||||
mask_time_length (`int`, *optional*, defaults to 10):
|
mask_time_length (`int`, *optional*, defaults to 10):
|
||||||
Length of vector span along the time axis.
|
Length of vector span along the time axis.
|
||||||
mask_time_min_masks (`int`, *optional*, defaults to 2),:
|
mask_time_min_masks (`int`, *optional*, defaults to 2):
|
||||||
The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
|
The minimum number of masks of length `mask_time_length` generated along the time axis, each time step,
|
||||||
irrespectively of `mask_time_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
|
irrespectively of `mask_time_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
|
||||||
mask_time_min_masks''
|
mask_time_min_masks''
|
||||||
@@ -132,7 +132,7 @@ class UniSpeechSatConfig(PretrainedConfig):
|
|||||||
True`.
|
True`.
|
||||||
mask_feature_length (`int`, *optional*, defaults to 10):
|
mask_feature_length (`int`, *optional*, defaults to 10):
|
||||||
Length of vector span along the feature axis.
|
Length of vector span along the feature axis.
|
||||||
mask_feature_min_masks (`int`, *optional*, defaults to 0),:
|
mask_feature_min_masks (`int`, *optional*, defaults to 0):
|
||||||
The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
|
The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
|
||||||
step, irrespectively of `mask_feature_prob`. Only relevant if
|
step, irrespectively of `mask_feature_prob`. Only relevant if
|
||||||
''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
|
''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
|
||||||
@@ -142,8 +142,6 @@ class UniSpeechSatConfig(PretrainedConfig):
|
|||||||
Number of codevector groups for product codevector quantization.
|
Number of codevector groups for product codevector quantization.
|
||||||
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
|
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
|
||||||
The temperature *kappa* in the contrastive loss.
|
The temperature *kappa* in the contrastive loss.
|
||||||
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
|
||||||
The dropout probabilitiy for the output of the feature encoder that's used by the quantizer.
|
|
||||||
num_negatives (`int`, *optional*, defaults to 100):
|
num_negatives (`int`, *optional*, defaults to 100):
|
||||||
Number of negative samples for the contrastive loss.
|
Number of negative samples for the contrastive loss.
|
||||||
codevector_dim (`int`, *optional*, defaults to 256):
|
codevector_dim (`int`, *optional*, defaults to 256):
|
||||||
@@ -175,6 +173,15 @@ class UniSpeechSatConfig(PretrainedConfig):
|
|||||||
*XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
|
*XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
|
||||||
xvector_output_dim (`int`, *optional*, defaults to 512):
|
xvector_output_dim (`int`, *optional*, defaults to 512):
|
||||||
Dimensionality of the *XVector* embedding vectors.
|
Dimensionality of the *XVector* embedding vectors.
|
||||||
|
pad_token_id (`int`, *optional*, defaults to 0):
|
||||||
|
The id of the padding token.
|
||||||
|
bos_token_id (`int`, *optional*, defaults to 1):
|
||||||
|
The id of the "beginning-of-sequence" token.
|
||||||
|
eos_token_id (`int`, *optional*, defaults to 2):
|
||||||
|
The id of the "end-of-sequence" token.
|
||||||
|
num_clusters (`int`, *optional*, defaults to 504):
|
||||||
|
Number of clusters for weak labeling. Only relevant when using an instance of
|
||||||
|
[`UniSpeechSatForPreTraining`].
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
|
|||||||
@@ -1381,6 +1381,12 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
|
|||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
"""UniSpeechSat Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
|
"""UniSpeechSat Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
|
||||||
UNISPEECH_SAT_START_DOCSTRING,
|
UNISPEECH_SAT_START_DOCSTRING,
|
||||||
|
"""
|
||||||
|
target_lang (`str`, *optional*):
|
||||||
|
Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
|
||||||
|
adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechSatForCTC`] with adapters. Uses
|
||||||
|
'eng' by default.
|
||||||
|
""",
|
||||||
)
|
)
|
||||||
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->UniSpeechSat, wav2vec2->unispeech_sat, WAV_2_VEC_2->UNISPEECH_SAT
|
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC with Wav2Vec2->UniSpeechSat, wav2vec2->unispeech_sat, WAV_2_VEC_2->UNISPEECH_SAT
|
||||||
class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel):
|
class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel):
|
||||||
|
|||||||
@@ -1871,6 +1871,12 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel):
|
|||||||
@add_start_docstrings(
|
@add_start_docstrings(
|
||||||
"""Wav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
|
"""Wav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
|
||||||
WAV_2_VEC_2_START_DOCSTRING,
|
WAV_2_VEC_2_START_DOCSTRING,
|
||||||
|
"""
|
||||||
|
target_lang (`str`, *optional*):
|
||||||
|
Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
|
||||||
|
adapter.<lang>.bin. Only relevant when using an instance of [`Wav2Vec2ForCTC`] with adapters. Uses 'eng' by
|
||||||
|
default.
|
||||||
|
""",
|
||||||
)
|
)
|
||||||
class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
|
class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
|
||||||
def __init__(self, config, target_lang: Optional[str] = None):
|
def __init__(self, config, target_lang: Optional[str] = None):
|
||||||
|
|||||||
@@ -760,10 +760,6 @@ OBJECTS_TO_IGNORE = [
|
|||||||
"TranslationPipeline",
|
"TranslationPipeline",
|
||||||
"TvltImageProcessor",
|
"TvltImageProcessor",
|
||||||
"UMT5Config",
|
"UMT5Config",
|
||||||
"UniSpeechConfig",
|
|
||||||
"UniSpeechForCTC",
|
|
||||||
"UniSpeechSatConfig",
|
|
||||||
"UniSpeechSatForCTC",
|
|
||||||
"UperNetConfig",
|
"UperNetConfig",
|
||||||
"UperNetForSemanticSegmentation",
|
"UperNetForSemanticSegmentation",
|
||||||
"ViTHybridImageProcessor",
|
"ViTHybridImageProcessor",
|
||||||
@@ -787,7 +783,6 @@ OBJECTS_TO_IGNORE = [
|
|||||||
"Wav2Vec2ConformerConfig",
|
"Wav2Vec2ConformerConfig",
|
||||||
"Wav2Vec2ConformerForCTC",
|
"Wav2Vec2ConformerForCTC",
|
||||||
"Wav2Vec2FeatureExtractor",
|
"Wav2Vec2FeatureExtractor",
|
||||||
"Wav2Vec2ForCTC",
|
|
||||||
"Wav2Vec2PhonemeCTCTokenizer",
|
"Wav2Vec2PhonemeCTCTokenizer",
|
||||||
"WavLMConfig",
|
"WavLMConfig",
|
||||||
"WavLMForCTC",
|
"WavLMForCTC",
|
||||||
|
|||||||
Reference in New Issue
Block a user