[Wav2Vec2] Rename model's feature extractor to feature encoder (#14959)
* rename classes * clean up more namings * remove bogus file * Apply suggestions from code review * Apply suggestions from code review * replace more names * more regex replace * make style * correct * correct more * make style * finish * correct more in wav2vec2 * make style * improve freeze_extractor * add aliases * add tf aliases
This commit is contained in:
committed by
GitHub
parent
1bfa347707
commit
600496fa50
@@ -17,6 +17,7 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import warnings
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from random import randint
|
from random import randint
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@@ -76,24 +77,24 @@ class DataTrainingArguments:
|
|||||||
eval_file: Optional[str] = field(
|
eval_file: Optional[str] = field(
|
||||||
default=None, metadata={"help": "A file containing the validation audio paths and labels."}
|
default=None, metadata={"help": "A file containing the validation audio paths and labels."}
|
||||||
)
|
)
|
||||||
train_split_name: Optional[str] = field(
|
train_split_name: str = field(
|
||||||
default="train",
|
default="train",
|
||||||
metadata={
|
metadata={
|
||||||
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
|
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
eval_split_name: Optional[str] = field(
|
eval_split_name: str = field(
|
||||||
default="validation",
|
default="validation",
|
||||||
metadata={
|
metadata={
|
||||||
"help": "The name of the training data set split to use (via the datasets library). Defaults to "
|
"help": "The name of the training data set split to use (via the datasets library). Defaults to "
|
||||||
"'validation'"
|
"'validation'"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
audio_column_name: Optional[str] = field(
|
audio_column_name: str = field(
|
||||||
default="audio",
|
default="audio",
|
||||||
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
|
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
|
||||||
)
|
)
|
||||||
label_column_name: Optional[str] = field(
|
label_column_name: str = field(
|
||||||
default="label", metadata={"help": "The name of the dataset column containing the labels. Defaults to 'label'"}
|
default="label", metadata={"help": "The name of the dataset column containing the labels. Defaults to 'label'"}
|
||||||
)
|
)
|
||||||
max_train_samples: Optional[int] = field(
|
max_train_samples: Optional[int] = field(
|
||||||
@@ -110,7 +111,7 @@ class DataTrainingArguments:
|
|||||||
"value if set."
|
"value if set."
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
max_length_seconds: Optional[float] = field(
|
max_length_seconds: float = field(
|
||||||
default=20,
|
default=20,
|
||||||
metadata={"help": "Audio clips will be randomly cut to this length during training if the value is set."},
|
metadata={"help": "Audio clips will be randomly cut to this length during training if the value is set."},
|
||||||
)
|
)
|
||||||
@@ -136,11 +137,13 @@ class ModelArguments:
|
|||||||
default="main",
|
default="main",
|
||||||
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
|
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
|
||||||
)
|
)
|
||||||
feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."})
|
feature_extractor_name: Optional[str] = field(
|
||||||
freeze_feature_extractor: Optional[bool] = field(
|
default=None, metadata={"help": "Name or path of preprocessor config."}
|
||||||
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
|
|
||||||
)
|
)
|
||||||
attention_mask: Optional[bool] = field(
|
freeze_feature_encoder: bool = field(
|
||||||
|
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
|
||||||
|
)
|
||||||
|
attention_mask: bool = field(
|
||||||
default=True, metadata={"help": "Whether to generate an attention mask in the feature extractor."}
|
default=True, metadata={"help": "Whether to generate an attention mask in the feature extractor."}
|
||||||
)
|
)
|
||||||
use_auth_token: bool = field(
|
use_auth_token: bool = field(
|
||||||
@@ -150,6 +153,24 @@ class ModelArguments:
|
|||||||
"with private models)."
|
"with private models)."
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
freeze_feature_extractor: Optional[bool] = field(
|
||||||
|
default=None, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
|
||||||
|
)
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
if not self.freeze_feature_extractor and self.freeze_feature_encoder:
|
||||||
|
warnings.warn(
|
||||||
|
"The argument `--freeze_feature_extractor` is deprecated and "
|
||||||
|
"will be removed in a future version. Use `--freeze_feature_encoder`"
|
||||||
|
"instead. Setting `freeze_feature_encoder==True`.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
if self.freeze_feature_extractor and not self.freeze_feature_encoder:
|
||||||
|
raise ValueError(
|
||||||
|
"The argument `--freeze_feature_extractor` is deprecated and "
|
||||||
|
"should not be used in combination with `--freeze_feature_encoder`."
|
||||||
|
"Only make use of `--freeze_feature_encoder`."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
@@ -302,8 +323,8 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# freeze the convolutional waveform encoder
|
# freeze the convolutional waveform encoder
|
||||||
if model_args.freeze_feature_extractor:
|
if model_args.freeze_feature_encoder:
|
||||||
model.freeze_feature_extractor()
|
model.freeze_feature_encoder()
|
||||||
|
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
|
|||||||
@@ -78,7 +78,7 @@ python run_speech_recognition_ctc.py \
|
|||||||
--eval_steps="100" \
|
--eval_steps="100" \
|
||||||
--layerdrop="0.0" \
|
--layerdrop="0.0" \
|
||||||
--save_total_limit="3" \
|
--save_total_limit="3" \
|
||||||
--freeze_feature_extractor \
|
--freeze_feature_encoder \
|
||||||
--gradient_checkpointing \
|
--gradient_checkpointing \
|
||||||
--chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” <20> \
|
--chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” <20> \
|
||||||
--fp16 \
|
--fp16 \
|
||||||
@@ -113,7 +113,7 @@ python -m torch.distributed.launch \
|
|||||||
--logging_steps="1" \
|
--logging_steps="1" \
|
||||||
--layerdrop="0.0" \
|
--layerdrop="0.0" \
|
||||||
--save_total_limit="3" \
|
--save_total_limit="3" \
|
||||||
--freeze_feature_extractor \
|
--freeze_feature_encoder \
|
||||||
--gradient_checkpointing \
|
--gradient_checkpointing \
|
||||||
--chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” <20> \
|
--chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” <20> \
|
||||||
--fp16 \
|
--fp16 \
|
||||||
@@ -304,7 +304,7 @@ python run_speech_recognition_seq2seq.py \
|
|||||||
--eval_steps="400" \
|
--eval_steps="400" \
|
||||||
--logging_steps="10" \
|
--logging_steps="10" \
|
||||||
--save_total_limit="1" \
|
--save_total_limit="1" \
|
||||||
--freeze_feature_extractor \
|
--freeze_feature_encoder \
|
||||||
--gradient_checkpointing \
|
--gradient_checkpointing \
|
||||||
--fp16 \
|
--fp16 \
|
||||||
--group_by_length \
|
--group_by_length \
|
||||||
@@ -346,7 +346,7 @@ python -m torch.distributed.launch \
|
|||||||
--eval_steps="400" \
|
--eval_steps="400" \
|
||||||
--logging_steps="10" \
|
--logging_steps="10" \
|
||||||
--save_total_limit="1" \
|
--save_total_limit="1" \
|
||||||
--freeze_feature_extractor \
|
--freeze_feature_encoder \
|
||||||
--gradient_checkpointing \
|
--gradient_checkpointing \
|
||||||
--fp16 \
|
--fp16 \
|
||||||
--group_by_length \
|
--group_by_length \
|
||||||
|
|||||||
@@ -78,29 +78,27 @@ class ModelArguments:
|
|||||||
default=None,
|
default=None,
|
||||||
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
|
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
|
||||||
)
|
)
|
||||||
freeze_feature_extractor: Optional[bool] = field(
|
freeze_feature_encoder: bool = field(
|
||||||
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
|
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
|
||||||
)
|
)
|
||||||
attention_dropout: Optional[float] = field(
|
attention_dropout: float = field(
|
||||||
default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
|
default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."}
|
||||||
)
|
)
|
||||||
activation_dropout: Optional[float] = field(
|
activation_dropout: float = field(
|
||||||
default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
|
default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."}
|
||||||
)
|
)
|
||||||
feat_proj_dropout: Optional[float] = field(
|
feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."})
|
||||||
default=0.0, metadata={"help": "The dropout ratio for the projected features."}
|
hidden_dropout: float = field(
|
||||||
)
|
|
||||||
hidden_dropout: Optional[float] = field(
|
|
||||||
default=0.0,
|
default=0.0,
|
||||||
metadata={
|
metadata={
|
||||||
"help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
|
"help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler."
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
final_dropout: Optional[float] = field(
|
final_dropout: float = field(
|
||||||
default=0.0,
|
default=0.0,
|
||||||
metadata={"help": "The dropout probability for the final projection layer."},
|
metadata={"help": "The dropout probability for the final projection layer."},
|
||||||
)
|
)
|
||||||
mask_time_prob: Optional[float] = field(
|
mask_time_prob: float = field(
|
||||||
default=0.05,
|
default=0.05,
|
||||||
metadata={
|
metadata={
|
||||||
"help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
|
"help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
|
||||||
@@ -108,22 +106,22 @@ class ModelArguments:
|
|||||||
"vectors will be masked along the time axis."
|
"vectors will be masked along the time axis."
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
mask_time_length: Optional[int] = field(
|
mask_time_length: int = field(
|
||||||
default=10,
|
default=10,
|
||||||
metadata={"help": "Length of vector span to mask along the time axis."},
|
metadata={"help": "Length of vector span to mask along the time axis."},
|
||||||
)
|
)
|
||||||
mask_feature_prob: Optional[float] = field(
|
mask_feature_prob: float = field(
|
||||||
default=0.0,
|
default=0.0,
|
||||||
metadata={
|
metadata={
|
||||||
"help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
|
"help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
|
||||||
"span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
|
"span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
mask_feature_length: Optional[int] = field(
|
mask_feature_length: int = field(
|
||||||
default=10,
|
default=10,
|
||||||
metadata={"help": "Length of vector span to mask along the feature axis."},
|
metadata={"help": "Length of vector span to mask along the feature axis."},
|
||||||
)
|
)
|
||||||
layerdrop: Optional[float] = field(default=0.0, metadata={"help": "The LayerDrop probability."})
|
layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."})
|
||||||
ctc_loss_reduction: Optional[str] = field(
|
ctc_loss_reduction: Optional[str] = field(
|
||||||
default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
|
default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
|
||||||
)
|
)
|
||||||
@@ -142,26 +140,26 @@ class DataTrainingArguments:
|
|||||||
dataset_name: str = field(
|
dataset_name: str = field(
|
||||||
metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||||
)
|
)
|
||||||
dataset_config_name: Optional[str] = field(
|
dataset_config_name: str = field(
|
||||||
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||||
)
|
)
|
||||||
train_split_name: Optional[str] = field(
|
train_split_name: str = field(
|
||||||
default="train+validation",
|
default="train+validation",
|
||||||
metadata={
|
metadata={
|
||||||
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
|
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
eval_split_name: Optional[str] = field(
|
eval_split_name: str = field(
|
||||||
default="test",
|
default="test",
|
||||||
metadata={
|
metadata={
|
||||||
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
|
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
audio_column_name: Optional[str] = field(
|
audio_column_name: str = field(
|
||||||
default="audio",
|
default="audio",
|
||||||
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
|
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
|
||||||
)
|
)
|
||||||
text_column_name: Optional[str] = field(
|
text_column_name: str = field(
|
||||||
default="text",
|
default="text",
|
||||||
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
|
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
|
||||||
)
|
)
|
||||||
@@ -190,20 +188,20 @@ class DataTrainingArguments:
|
|||||||
default=None,
|
default=None,
|
||||||
metadata={"help": "A list of characters to remove from the transcripts."},
|
metadata={"help": "A list of characters to remove from the transcripts."},
|
||||||
)
|
)
|
||||||
eval_metrics: Optional[List[str]] = list_field(
|
eval_metrics: List[str] = list_field(
|
||||||
default=["wer"],
|
default=["wer"],
|
||||||
metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
|
metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
|
||||||
)
|
)
|
||||||
max_duration_in_seconds: Optional[float] = field(
|
max_duration_in_seconds: float = field(
|
||||||
default=20.0,
|
default=20.0,
|
||||||
metadata={
|
metadata={
|
||||||
"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
|
"help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
min_duration_in_seconds: Optional[float] = field(
|
min_duration_in_seconds: float = field(
|
||||||
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
|
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
|
||||||
)
|
)
|
||||||
preprocessing_only: Optional[bool] = field(
|
preprocessing_only: bool = field(
|
||||||
default=False,
|
default=False,
|
||||||
metadata={
|
metadata={
|
||||||
"help": "Whether to only do data preprocessing and skip training. "
|
"help": "Whether to only do data preprocessing and skip training. "
|
||||||
@@ -212,22 +210,22 @@ class DataTrainingArguments:
|
|||||||
"so that the cached datasets can consequently be loaded in distributed training"
|
"so that the cached datasets can consequently be loaded in distributed training"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
use_auth_token: Optional[bool] = field(
|
use_auth_token: bool = field(
|
||||||
default=False,
|
default=False,
|
||||||
metadata={
|
metadata={
|
||||||
"help": "If :obj:`True`, will use the token generated when running"
|
"help": "If :obj:`True`, will use the token generated when running"
|
||||||
":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
|
":obj:`transformers-cli login` as HTTP bearer authorization for remote files."
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
unk_token: Optional[str] = field(
|
unk_token: str = field(
|
||||||
default="[UNK]",
|
default="[UNK]",
|
||||||
metadata={"help": "The unk token for the tokenizer"},
|
metadata={"help": "The unk token for the tokenizer"},
|
||||||
)
|
)
|
||||||
pad_token: Optional[str] = field(
|
pad_token: str = field(
|
||||||
default="[PAD]",
|
default="[PAD]",
|
||||||
metadata={"help": "The padding token for the tokenizer"},
|
metadata={"help": "The padding token for the tokenizer"},
|
||||||
)
|
)
|
||||||
word_delimiter_token: Optional[str] = field(
|
word_delimiter_token: str = field(
|
||||||
default="|",
|
default="|",
|
||||||
metadata={"help": "The word delimiter token for the tokenizer"},
|
metadata={"help": "The word delimiter token for the tokenizer"},
|
||||||
)
|
)
|
||||||
@@ -545,8 +543,8 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
# freeze encoder
|
# freeze encoder
|
||||||
if model_args.freeze_feature_extractor:
|
if model_args.freeze_feature_encoder:
|
||||||
model.freeze_feature_extractor()
|
model.freeze_feature_encoder()
|
||||||
|
|
||||||
# 6. Now we preprocess the datasets including loading the audio, resampling and normalization
|
# 6. Now we preprocess the datasets including loading the audio, resampling and normalization
|
||||||
# Thankfully, `datasets` takes care of automatically loading and resampling the audio,
|
# Thankfully, `datasets` takes care of automatically loading and resampling the audio,
|
||||||
|
|||||||
@@ -91,8 +91,8 @@ class ModelArguments:
|
|||||||
"with private models)."
|
"with private models)."
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
freeze_feature_extractor: Optional[bool] = field(
|
freeze_feature_encoder: bool = field(
|
||||||
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
|
default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -102,7 +102,7 @@ class DataTrainingArguments:
|
|||||||
Arguments pertaining to what data we are going to input our model for training and eval.
|
Arguments pertaining to what data we are going to input our model for training and eval.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
dataset_name: Optional[str] = field(
|
dataset_name: str = field(
|
||||||
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
|
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
|
||||||
)
|
)
|
||||||
dataset_config_name: Optional[str] = field(
|
dataset_config_name: Optional[str] = field(
|
||||||
@@ -133,24 +133,24 @@ class DataTrainingArguments:
|
|||||||
"value if set."
|
"value if set."
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
audio_column_name: Optional[str] = field(
|
audio_column_name: str = field(
|
||||||
default="audio",
|
default="audio",
|
||||||
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
|
metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"},
|
||||||
)
|
)
|
||||||
text_column_name: Optional[str] = field(
|
text_column_name: str = field(
|
||||||
default="text",
|
default="text",
|
||||||
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
|
metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"},
|
||||||
)
|
)
|
||||||
max_duration_in_seconds: Optional[float] = field(
|
max_duration_in_seconds: float = field(
|
||||||
default=20.0,
|
default=20.0,
|
||||||
metadata={
|
metadata={
|
||||||
"help": "Truncate audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
|
"help": "Truncate audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
min_duration_in_seconds: Optional[float] = field(
|
min_duration_in_seconds: float = field(
|
||||||
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
|
default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"}
|
||||||
)
|
)
|
||||||
preprocessing_only: Optional[bool] = field(
|
preprocessing_only: bool = field(
|
||||||
default=False,
|
default=False,
|
||||||
metadata={
|
metadata={
|
||||||
"help": "Whether to only do data preprocessing and skip training. "
|
"help": "Whether to only do data preprocessing and skip training. "
|
||||||
@@ -159,19 +159,19 @@ class DataTrainingArguments:
|
|||||||
"so that the cached datasets can consequently be loaded in distributed training"
|
"so that the cached datasets can consequently be loaded in distributed training"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
train_split_name: Optional[str] = field(
|
train_split_name: str = field(
|
||||||
default="train",
|
default="train",
|
||||||
metadata={
|
metadata={
|
||||||
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
|
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
eval_split_name: Optional[str] = field(
|
eval_split_name: str = field(
|
||||||
default="test",
|
default="test",
|
||||||
metadata={
|
metadata={
|
||||||
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
|
"help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
do_lower_case: Optional[bool] = field(
|
do_lower_case: bool = field(
|
||||||
default=True,
|
default=True,
|
||||||
metadata={"help": "Whether the target text should be lower cased."},
|
metadata={"help": "Whether the target text should be lower cased."},
|
||||||
)
|
)
|
||||||
@@ -335,8 +335,8 @@ def main():
|
|||||||
if model.config.decoder_start_token_id is None:
|
if model.config.decoder_start_token_id is None:
|
||||||
raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
|
raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
|
||||||
|
|
||||||
if model_args.freeze_feature_extractor:
|
if model_args.freeze_feature_encoder:
|
||||||
model.freeze_feature_extractor()
|
model.freeze_feature_encoder()
|
||||||
|
|
||||||
# 6. Resample speech dataset if necessary
|
# 6. Resample speech dataset if necessary
|
||||||
dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
|
dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate
|
||||||
|
|||||||
@@ -64,24 +64,24 @@ class HubertConfig(PretrainedConfig):
|
|||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
||||||
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
|
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
|
||||||
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
||||||
convolutional layers.
|
convolutional layers.
|
||||||
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probability for output of the feature extractor.
|
The dropout probability for output of the feature encoder.
|
||||||
feat_proj_layer_norm (`bool`, *optional*, defaults to `True`):
|
feat_proj_layer_norm (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to apply LayerNorm to the output of the feature extractor.
|
Whether to apply LayerNorm to the output of the feature encoder.
|
||||||
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
||||||
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
||||||
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||||
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
|
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
|
||||||
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
||||||
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
|
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
|
||||||
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
|
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
|
||||||
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
|
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
|
||||||
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
||||||
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
|
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
|
||||||
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
|
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
|
||||||
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
||||||
*conv_dim*.
|
*conv_dim*.
|
||||||
conv_bias (`bool`, *optional*, defaults to `False`):
|
conv_bias (`bool`, *optional*, defaults to `False`):
|
||||||
@@ -96,7 +96,7 @@ class HubertConfig(PretrainedConfig):
|
|||||||
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
|
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
|
||||||
False` corresponds to applying layer norm after the attention layer.
|
False` corresponds to applying layer norm after the attention layer.
|
||||||
apply_spec_augment (`bool`, *optional*, defaults to `True`):
|
apply_spec_augment (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
|
Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
|
||||||
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
||||||
Recognition](https://arxiv.org/abs/1904.08779).
|
Recognition](https://arxiv.org/abs/1904.08779).
|
||||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||||
|
|||||||
@@ -14,6 +14,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
""" PyTorch Hubert model."""
|
""" PyTorch Hubert model."""
|
||||||
|
|
||||||
|
import warnings
|
||||||
from typing import Optional, Tuple, Union
|
from typing import Optional, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -284,8 +285,8 @@ class HubertSamePadLayer(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->Hubert
|
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->Hubert
|
||||||
class HubertFeatureExtractor(nn.Module):
|
class HubertFeatureEncoder(nn.Module):
|
||||||
"""Construct the features from raw audio waveform"""
|
"""Construct the features from raw audio waveform"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@@ -336,6 +337,17 @@ class HubertFeatureExtractor(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class HubertFeatureExtractor(HubertFeatureEncoder):
|
||||||
|
def __init__(self, config):
|
||||||
|
super().__init__(config)
|
||||||
|
warnings.warn(
|
||||||
|
f"The class `{self.__class__.__name__}` has been depreciated "
|
||||||
|
"and will be removed in Transformers v5. "
|
||||||
|
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class HubertFeatureProjection(nn.Module):
|
class HubertFeatureProjection(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@@ -902,7 +914,7 @@ class HubertModel(HubertPreTrainedModel):
|
|||||||
def __init__(self, config: HubertConfig):
|
def __init__(self, config: HubertConfig):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.feature_extractor = HubertFeatureExtractor(config)
|
self.feature_extractor = HubertFeatureEncoder(config)
|
||||||
self.feature_projection = HubertFeatureProjection(config)
|
self.feature_projection = HubertFeatureProjection(config)
|
||||||
|
|
||||||
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
|
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
|
||||||
@@ -1063,8 +1075,20 @@ class HubertForCTC(HubertPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameter
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.hubert.feature_extractor._freeze_parameters()
|
self.hubert.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1172,8 +1196,20 @@ class HubertForSequenceClassification(HubertPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.hubert.feature_extractor._freeze_parameters()
|
self.hubert.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
|
|||||||
@@ -659,7 +659,7 @@ class TFHubertSamePadLayer(tf.keras.layers.Layer):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
class TFHubertFeatureExtractor(tf.keras.layers.Layer):
|
class TFHubertFeatureEncoder(tf.keras.layers.Layer):
|
||||||
def __init__(self, config: HubertConfig, **kwargs: Any) -> None:
|
def __init__(self, config: HubertConfig, **kwargs: Any) -> None:
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
@@ -686,6 +686,17 @@ class TFHubertFeatureExtractor(tf.keras.layers.Layer):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class TFHubertFeatureExtractor(TFHubertFeatureEncoder):
|
||||||
|
def __init__(self, config, **kwargs):
|
||||||
|
super().__init__(config, **kwargs)
|
||||||
|
warnings.warn(
|
||||||
|
f"The class `{self.__class__.__name__}` has been deprecated "
|
||||||
|
"and will be removed in Transformers v5. "
|
||||||
|
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TFHubertFeatureProjection(tf.keras.layers.Layer):
|
class TFHubertFeatureProjection(tf.keras.layers.Layer):
|
||||||
def __init__(self, config: HubertConfig, **kwargs):
|
def __init__(self, config: HubertConfig, **kwargs):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
@@ -1116,7 +1127,7 @@ class TFHubertMainLayer(tf.keras.layers.Layer):
|
|||||||
def __init__(self, config: HubertConfig, **kwargs):
|
def __init__(self, config: HubertConfig, **kwargs):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.feature_extractor = TFHubertFeatureExtractor(config, name="feature_extractor")
|
self.feature_extractor = TFHubertFeatureEncoder(config, name="feature_extractor")
|
||||||
self.feature_projection = TFHubertFeatureProjection(config, name="feature_projection")
|
self.feature_projection = TFHubertFeatureProjection(config, name="feature_projection")
|
||||||
|
|
||||||
if config.do_stable_layer_norm:
|
if config.do_stable_layer_norm:
|
||||||
@@ -1490,8 +1501,20 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameter
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.hubert.feature_extractor.trainable = False
|
self.hubert.feature_extractor.trainable = False
|
||||||
|
|
||||||
|
|||||||
@@ -65,22 +65,22 @@ class SEWConfig(PretrainedConfig):
|
|||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
||||||
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
|
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
|
||||||
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
||||||
convolutional layers.
|
convolutional layers.
|
||||||
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probability for output of the feature extractor.
|
The dropout probability for output of the feature encoder.
|
||||||
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
||||||
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
||||||
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||||
conv_dim (`Tuple[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`):
|
conv_dim (`Tuple[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`):
|
||||||
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
||||||
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
|
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
|
||||||
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
|
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
|
||||||
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
|
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
|
||||||
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
||||||
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
|
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
|
||||||
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
|
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
|
||||||
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
||||||
*conv_dim*.
|
*conv_dim*.
|
||||||
conv_bias (`bool`, *optional*, defaults to `False`):
|
conv_bias (`bool`, *optional*, defaults to `False`):
|
||||||
@@ -91,7 +91,7 @@ class SEWConfig(PretrainedConfig):
|
|||||||
num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
|
num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
|
||||||
Number of groups of 1D convolutional positional embeddings layer.
|
Number of groups of 1D convolutional positional embeddings layer.
|
||||||
apply_spec_augment (`bool`, *optional*, defaults to `True`):
|
apply_spec_augment (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
|
Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
|
||||||
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
||||||
Recognition](https://arxiv.org/abs/1904.08779).
|
Recognition](https://arxiv.org/abs/1904.08779).
|
||||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||||
|
|||||||
@@ -15,6 +15,7 @@
|
|||||||
""" PyTorch SEW model."""
|
""" PyTorch SEW model."""
|
||||||
|
|
||||||
import math
|
import math
|
||||||
|
import warnings
|
||||||
from typing import Optional, Tuple, Union
|
from typing import Optional, Tuple, Union
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -301,8 +302,8 @@ class SEWUpsampling(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->SEW
|
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->SEW
|
||||||
class SEWFeatureExtractor(nn.Module):
|
class SEWFeatureEncoder(nn.Module):
|
||||||
"""Construct the features from raw audio waveform"""
|
"""Construct the features from raw audio waveform"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@@ -353,6 +354,17 @@ class SEWFeatureExtractor(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class SEWFeatureExtractor(SEWFeatureEncoder):
|
||||||
|
def __init__(self, config):
|
||||||
|
super().__init__(config)
|
||||||
|
warnings.warn(
|
||||||
|
f"The class `{self.__class__.__name__}` has been deprecated "
|
||||||
|
"and will be removed in Transformers v5. "
|
||||||
|
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->SEW
|
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->SEW
|
||||||
class SEWAttention(nn.Module):
|
class SEWAttention(nn.Module):
|
||||||
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
||||||
@@ -712,7 +724,7 @@ class SEWPreTrainedModel(PreTrainedModel):
|
|||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
|
|
||||||
def _set_gradient_checkpointing(self, module, value=False):
|
def _set_gradient_checkpointing(self, module, value=False):
|
||||||
if isinstance(module, (SEWEncoder, SEWFeatureExtractor)):
|
if isinstance(module, (SEWEncoder, SEWFeatureEncoder)):
|
||||||
module.gradient_checkpointing = value
|
module.gradient_checkpointing = value
|
||||||
|
|
||||||
def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
|
def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
|
||||||
@@ -797,7 +809,7 @@ class SEWModel(SEWPreTrainedModel):
|
|||||||
def __init__(self, config: SEWConfig):
|
def __init__(self, config: SEWConfig):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.feature_extractor = SEWFeatureExtractor(config)
|
self.feature_extractor = SEWFeatureEncoder(config)
|
||||||
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
|
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
|
||||||
|
|
||||||
self.project_features = config.conv_dim[-1] != config.hidden_size
|
self.project_features = config.conv_dim[-1] != config.hidden_size
|
||||||
@@ -943,8 +955,20 @@ class SEWForCTC(SEWPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameter
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.sew.feature_extractor._freeze_parameters()
|
self.sew.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1052,8 +1076,20 @@ class SEWForSequenceClassification(SEWPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.sew.feature_extractor._freeze_parameters()
|
self.sew.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
|
|||||||
@@ -81,24 +81,24 @@ class SEWDConfig(PretrainedConfig):
|
|||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-7):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-7):
|
||||||
The epsilon used by the layer normalization layers in the transformer encoder.
|
The epsilon used by the layer normalization layers in the transformer encoder.
|
||||||
feature_layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
feature_layer_norm_eps (`float`, *optional*, defaults to 1e-5):
|
||||||
The epsilon used by the layer normalization after the feature extractor.
|
The epsilon used by the layer normalization after the feature encoder.
|
||||||
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
||||||
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
|
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
|
||||||
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
||||||
convolutional layers.
|
convolutional layers.
|
||||||
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probability for output of the feature extractor.
|
The dropout probability for output of the feature encoder.
|
||||||
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
||||||
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
||||||
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||||
conv_dim (`Tuple[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`):
|
conv_dim (`Tuple[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`):
|
||||||
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
||||||
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
|
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
|
||||||
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
|
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
|
||||||
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
|
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
|
||||||
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
||||||
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
|
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
|
||||||
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
|
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
|
||||||
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
||||||
*conv_dim*.
|
*conv_dim*.
|
||||||
conv_bias (`bool`, *optional*, defaults to `False`):
|
conv_bias (`bool`, *optional*, defaults to `False`):
|
||||||
@@ -109,7 +109,7 @@ class SEWDConfig(PretrainedConfig):
|
|||||||
num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
|
num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
|
||||||
Number of groups of 1D convolutional positional embeddings layer.
|
Number of groups of 1D convolutional positional embeddings layer.
|
||||||
apply_spec_augment (`bool`, *optional*, defaults to `True`):
|
apply_spec_augment (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
|
Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
|
||||||
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
||||||
Recognition](https://arxiv.org/abs/1904.08779).
|
Recognition](https://arxiv.org/abs/1904.08779).
|
||||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||||
|
|||||||
@@ -15,6 +15,7 @@
|
|||||||
""" PyTorch SEW model."""
|
""" PyTorch SEW model."""
|
||||||
|
|
||||||
import math
|
import math
|
||||||
|
import warnings
|
||||||
from collections.abc import Sequence
|
from collections.abc import Sequence
|
||||||
from typing import Optional, Tuple, Union
|
from typing import Optional, Tuple, Union
|
||||||
|
|
||||||
@@ -387,8 +388,8 @@ class SEWDUpsampling(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->SEWD
|
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->SEWD
|
||||||
class SEWDFeatureExtractor(nn.Module):
|
class SEWDFeatureEncoder(nn.Module):
|
||||||
"""Construct the features from raw audio waveform"""
|
"""Construct the features from raw audio waveform"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@@ -439,6 +440,17 @@ class SEWDFeatureExtractor(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class SEWDFeatureExtractor(SEWDFeatureEncoder):
|
||||||
|
def __init__(self, config):
|
||||||
|
super().__init__(config)
|
||||||
|
warnings.warn(
|
||||||
|
f"The class `{self.__class__.__name__}` has been deprecated "
|
||||||
|
"and will be removed in Transformers v5. "
|
||||||
|
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.deberta.modeling_deberta.ContextPooler
|
# Copied from transformers.models.deberta.modeling_deberta.ContextPooler
|
||||||
class ContextPooler(nn.Module):
|
class ContextPooler(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@@ -1333,7 +1345,7 @@ class SEWDModel(SEWDPreTrainedModel):
|
|||||||
def __init__(self, config: SEWDConfig):
|
def __init__(self, config: SEWDConfig):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.feature_extractor = SEWDFeatureExtractor(config)
|
self.feature_extractor = SEWDFeatureEncoder(config)
|
||||||
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.feature_layer_norm_eps)
|
self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.feature_layer_norm_eps)
|
||||||
|
|
||||||
self.project_features = config.conv_dim[-1] != config.hidden_size
|
self.project_features = config.conv_dim[-1] != config.hidden_size
|
||||||
@@ -1479,8 +1491,20 @@ class SEWDForCTC(SEWDPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameter
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.sew_d.feature_extractor._freeze_parameters()
|
self.sew_d.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1588,8 +1612,20 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.sew_d.feature_extractor._freeze_parameters()
|
self.sew_d.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
|
|||||||
@@ -265,12 +265,12 @@ class SpeechEncoderDecoderModel(PreTrainedModel):
|
|||||||
def set_output_embeddings(self, new_embeddings):
|
def set_output_embeddings(self, new_embeddings):
|
||||||
return self.decoder.set_output_embeddings(new_embeddings)
|
return self.decoder.set_output_embeddings(new_embeddings)
|
||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_encoder(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor of the speech encoder so
|
Calling this function will disable the gradient computation for the feature encoder of the speech encoder so
|
||||||
that its parameters will not be updated during training.
|
that its parameters will not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.encoder.freeze_feature_extractor()
|
self.encoder.freeze_feature_encoder()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_pretrained(cls, *args, **kwargs):
|
def from_pretrained(cls, *args, **kwargs):
|
||||||
|
|||||||
@@ -65,24 +65,24 @@ class UniSpeechConfig(PretrainedConfig):
|
|||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
||||||
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
|
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
|
||||||
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
||||||
convolutional layers.
|
convolutional layers.
|
||||||
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probability for output of the feature extractor.
|
The dropout probability for output of the feature encoder.
|
||||||
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
||||||
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
||||||
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||||
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probability for quantized feature extractor states.
|
The dropout probability for quantized feature encoder states.
|
||||||
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
|
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
|
||||||
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
||||||
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
|
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
|
||||||
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
|
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
|
||||||
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
|
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
|
||||||
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
||||||
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
|
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
|
||||||
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
|
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
|
||||||
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
||||||
*conv_dim*.
|
*conv_dim*.
|
||||||
conv_bias (`bool`, *optional*, defaults to `False`):
|
conv_bias (`bool`, *optional*, defaults to `False`):
|
||||||
@@ -97,7 +97,7 @@ class UniSpeechConfig(PretrainedConfig):
|
|||||||
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
|
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
|
||||||
False` corresponds to applying layer norm after the attention layer.
|
False` corresponds to applying layer norm after the attention layer.
|
||||||
apply_spec_augment (`bool`, *optional*, defaults to `True`):
|
apply_spec_augment (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
|
Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
|
||||||
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
||||||
Recognition](https://arxiv.org/abs/1904.08779).
|
Recognition](https://arxiv.org/abs/1904.08779).
|
||||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||||
@@ -132,7 +132,7 @@ class UniSpeechConfig(PretrainedConfig):
|
|||||||
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
|
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
|
||||||
The temperature *kappa* in the contrastive loss.
|
The temperature *kappa* in the contrastive loss.
|
||||||
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probabilitiy for the output of the feature extractor that's used by the quantizer.
|
The dropout probability for the output of the feature encoder that's used by the quantizer.
|
||||||
num_negatives (`int`, *optional*, defaults to 100):
|
num_negatives (`int`, *optional*, defaults to 100):
|
||||||
Number of negative samples for the contrastive loss.
|
Number of negative samples for the contrastive loss.
|
||||||
codevector_dim (`int`, *optional*, defaults to 256):
|
codevector_dim (`int`, *optional*, defaults to 256):
|
||||||
|
|||||||
@@ -15,6 +15,7 @@
|
|||||||
""" PyTorch UniSpeech model."""
|
""" PyTorch UniSpeech model."""
|
||||||
|
|
||||||
import math
|
import math
|
||||||
|
import warnings
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Optional, Tuple, Union
|
from typing import Optional, Tuple, Union
|
||||||
|
|
||||||
@@ -351,8 +352,8 @@ class UniSpeechSamePadLayer(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->UniSpeech
|
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->UniSpeech
|
||||||
class UniSpeechFeatureExtractor(nn.Module):
|
class UniSpeechFeatureEncoder(nn.Module):
|
||||||
"""Construct the features from raw audio waveform"""
|
"""Construct the features from raw audio waveform"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@@ -406,6 +407,17 @@ class UniSpeechFeatureExtractor(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class UniSpeechFeatureExtractor(UniSpeechFeatureEncoder):
|
||||||
|
def __init__(self, config):
|
||||||
|
super().__init__(config)
|
||||||
|
warnings.warn(
|
||||||
|
f"The class `{self.__class__.__name__}` has been deprecated "
|
||||||
|
"and will be removed in Transformers v5. "
|
||||||
|
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->UniSpeech
|
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->UniSpeech
|
||||||
class UniSpeechFeatureProjection(nn.Module):
|
class UniSpeechFeatureProjection(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@@ -980,7 +992,7 @@ class UniSpeechPreTrainedModel(PreTrainedModel):
|
|||||||
return attention_mask
|
return attention_mask
|
||||||
|
|
||||||
def _set_gradient_checkpointing(self, module, value=False):
|
def _set_gradient_checkpointing(self, module, value=False):
|
||||||
if isinstance(module, (UniSpeechEncoder, UniSpeechEncoderStableLayerNorm, UniSpeechFeatureExtractor)):
|
if isinstance(module, (UniSpeechEncoder, UniSpeechEncoderStableLayerNorm, UniSpeechFeatureEncoder)):
|
||||||
module.gradient_checkpointing = value
|
module.gradient_checkpointing = value
|
||||||
|
|
||||||
|
|
||||||
@@ -1049,7 +1061,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
|
|||||||
def __init__(self, config: UniSpeechConfig):
|
def __init__(self, config: UniSpeechConfig):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.feature_extractor = UniSpeechFeatureExtractor(config)
|
self.feature_extractor = UniSpeechFeatureEncoder(config)
|
||||||
self.feature_projection = UniSpeechFeatureProjection(config)
|
self.feature_projection = UniSpeechFeatureProjection(config)
|
||||||
|
|
||||||
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
|
if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
|
||||||
@@ -1193,8 +1205,20 @@ class UniSpeechForPreTraining(UniSpeechPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.unispeech.feature_extractor._freeze_parameters()
|
self.unispeech.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1358,8 +1382,20 @@ class UniSpeechForCTC(UniSpeechPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameter
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.unispeech.feature_extractor._freeze_parameters()
|
self.unispeech.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1467,8 +1503,20 @@ class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.unispeech.feature_extractor._freeze_parameters()
|
self.unispeech.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
|
|||||||
@@ -65,24 +65,24 @@ class UniSpeechSatConfig(PretrainedConfig):
|
|||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
||||||
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
|
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
|
||||||
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
||||||
convolutional layers.
|
convolutional layers.
|
||||||
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probability for output of the feature extractor.
|
The dropout probability for output of the feature encoder.
|
||||||
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
||||||
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
||||||
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||||
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probabilitiy for quantized feature extractor states.
|
The dropout probability for quantized feature encoder states.
|
||||||
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
|
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
|
||||||
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
||||||
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
|
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
|
||||||
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
|
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
|
||||||
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
|
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
|
||||||
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
||||||
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
|
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
|
||||||
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
|
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
|
||||||
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
||||||
*conv_dim*.
|
*conv_dim*.
|
||||||
conv_bias (`bool`, *optional*, defaults to `False`):
|
conv_bias (`bool`, *optional*, defaults to `False`):
|
||||||
@@ -97,7 +97,7 @@ class UniSpeechSatConfig(PretrainedConfig):
|
|||||||
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
|
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
|
||||||
False` corresponds to applying layer norm after the attention layer.
|
False` corresponds to applying layer norm after the attention layer.
|
||||||
apply_spec_augment (`bool`, *optional*, defaults to `True`):
|
apply_spec_augment (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
|
Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
|
||||||
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
||||||
Recognition](https://arxiv.org/abs/1904.08779).
|
Recognition](https://arxiv.org/abs/1904.08779).
|
||||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||||
@@ -132,7 +132,7 @@ class UniSpeechSatConfig(PretrainedConfig):
|
|||||||
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
|
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
|
||||||
The temperature *kappa* in the contrastive loss.
|
The temperature *kappa* in the contrastive loss.
|
||||||
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probabilitiy for the output of the feature extractor that's used by the quantizer.
|
The dropout probability for the output of the feature encoder that's used by the quantizer.
|
||||||
num_negatives (`int`, *optional*, defaults to 100):
|
num_negatives (`int`, *optional*, defaults to 100):
|
||||||
Number of negative samples for the contrastive loss.
|
Number of negative samples for the contrastive loss.
|
||||||
codevector_dim (`int`, *optional*, defaults to 256):
|
codevector_dim (`int`, *optional*, defaults to 256):
|
||||||
|
|||||||
@@ -20,12 +20,7 @@ import argparse
|
|||||||
import fairseq
|
import fairseq
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from transformers import ( # UniSpeechSatCTCTokenizer,; UniSpeechSatFeatureExtractor,; UniSpeechSatProcessor,
|
from transformers import UniSpeechSatConfig, UniSpeechSatForCTC, UniSpeechSatForPreTraining, logging
|
||||||
UniSpeechSatConfig,
|
|
||||||
UniSpeechSatForCTC,
|
|
||||||
UniSpeechSatForPreTraining,
|
|
||||||
logging,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
logging.set_verbosity_info()
|
logging.set_verbosity_info()
|
||||||
|
|||||||
@@ -15,6 +15,7 @@
|
|||||||
""" PyTorch UniSpeechSat model."""
|
""" PyTorch UniSpeechSat model."""
|
||||||
|
|
||||||
import math
|
import math
|
||||||
|
import warnings
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Optional, Tuple, Union
|
from typing import Optional, Tuple, Union
|
||||||
|
|
||||||
@@ -385,8 +386,8 @@ class UniSpeechSatSamePadLayer(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->UniSpeechSat
|
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->UniSpeechSat
|
||||||
class UniSpeechSatFeatureExtractor(nn.Module):
|
class UniSpeechSatFeatureEncoder(nn.Module):
|
||||||
"""Construct the features from raw audio waveform"""
|
"""Construct the features from raw audio waveform"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@@ -440,6 +441,17 @@ class UniSpeechSatFeatureExtractor(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class UniSpeechSatFeatureExtractor(UniSpeechSatFeatureEncoder):
|
||||||
|
def __init__(self, config):
|
||||||
|
super().__init__(config)
|
||||||
|
warnings.warn(
|
||||||
|
f"The class `{self.__class__.__name__}` has been deprecated "
|
||||||
|
"and will be removed in Transformers v5. "
|
||||||
|
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->UniSpeechSat
|
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->UniSpeechSat
|
||||||
class UniSpeechSatFeatureProjection(nn.Module):
|
class UniSpeechSatFeatureProjection(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@@ -1014,7 +1026,7 @@ class UniSpeechSatPreTrainedModel(PreTrainedModel):
|
|||||||
return attention_mask
|
return attention_mask
|
||||||
|
|
||||||
def _set_gradient_checkpointing(self, module, value=False):
|
def _set_gradient_checkpointing(self, module, value=False):
|
||||||
if isinstance(module, (UniSpeechSatEncoder, UniSpeechSatEncoderStableLayerNorm, UniSpeechSatFeatureExtractor)):
|
if isinstance(module, (UniSpeechSatEncoder, UniSpeechSatEncoderStableLayerNorm, UniSpeechSatFeatureEncoder)):
|
||||||
module.gradient_checkpointing = value
|
module.gradient_checkpointing = value
|
||||||
|
|
||||||
|
|
||||||
@@ -1084,7 +1096,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
|
|||||||
def __init__(self, config: UniSpeechSatConfig):
|
def __init__(self, config: UniSpeechSatConfig):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.feature_extractor = UniSpeechSatFeatureExtractor(config)
|
self.feature_extractor = UniSpeechSatFeatureEncoder(config)
|
||||||
self.feature_projection = UniSpeechSatFeatureProjection(config)
|
self.feature_projection = UniSpeechSatFeatureProjection(config)
|
||||||
|
|
||||||
self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
|
self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
|
||||||
@@ -1232,10 +1244,22 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.unispeech_sat.feature_extractor._freeze_parameters()
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
self.unispeech_sat.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def compute_contrastive_logits(
|
def compute_contrastive_logits(
|
||||||
@@ -1274,12 +1298,12 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
>>> import torch
|
>>> import torch
|
||||||
>>> from transformers import UniSpeechSatFeatureExtractor, UniSpeechSatForPreTraining
|
>>> from transformers import Wav2Vec2FeatureExtractor, UniSpeechSatForPreTraining
|
||||||
>>> from transformers.models.unispeech_sat.modeling_unispeech_sat import _compute_mask_indices
|
>>> from transformers.models.unispeech_sat.modeling_unispeech_sat import _compute_mask_indices
|
||||||
>>> from datasets import load_dataset
|
>>> from datasets import load_dataset
|
||||||
>>> import soundfile as sf
|
>>> import soundfile as sf
|
||||||

|

|
||||||
>>> feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("patrickvonplaten/unispeech_sat-base")
|
||||||
>>> model = UniSpeechSatForPreTraining.from_pretrained("patrickvonplaten/unispeech_sat-base")
|
>>> model = UniSpeechSatForPreTraining.from_pretrained("patrickvonplaten/unispeech_sat-base")
|
||||||
|
|
||||||
|
|
||||||
@@ -1383,8 +1407,20 @@ class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameter
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.unispeech_sat.feature_extractor._freeze_parameters()
|
self.unispeech_sat.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1492,8 +1528,20 @@ class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.unispeech_sat.feature_extractor._freeze_parameters()
|
self.unispeech_sat.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1596,8 +1644,20 @@ class UniSpeechSatForAudioFrameClassification(UniSpeechSatPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.unispeech_sat.feature_extractor._freeze_parameters()
|
self.unispeech_sat.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1745,8 +1805,20 @@ class UniSpeechSatForXVector(UniSpeechSatPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. "
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.unispeech_sat.feature_extractor._freeze_parameters()
|
self.unispeech_sat.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
|
|||||||
@@ -65,24 +65,24 @@ class Wav2Vec2Config(PretrainedConfig):
|
|||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
||||||
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
|
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
|
||||||
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
||||||
convolutional layers.
|
convolutional layers.
|
||||||
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probability for output of the feature extractor.
|
The dropout probability for output of the feature encoder.
|
||||||
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
||||||
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
||||||
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||||
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probabilitiy for quantized feature extractor states.
|
The dropout probability for quantized feature encoder states.
|
||||||
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
|
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
|
||||||
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
||||||
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
|
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
|
||||||
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
|
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
|
||||||
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
|
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
|
||||||
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
||||||
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
|
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
|
||||||
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
|
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
|
||||||
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
||||||
*conv_dim*.
|
*conv_dim*.
|
||||||
conv_bias (`bool`, *optional*, defaults to `False`):
|
conv_bias (`bool`, *optional*, defaults to `False`):
|
||||||
@@ -97,7 +97,7 @@ class Wav2Vec2Config(PretrainedConfig):
|
|||||||
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
|
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
|
||||||
False` corresponds to applying layer norm after the attention layer.
|
False` corresponds to applying layer norm after the attention layer.
|
||||||
apply_spec_augment (`bool`, *optional*, defaults to `True`):
|
apply_spec_augment (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
|
Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
|
||||||
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
||||||
Recognition](https://arxiv.org/abs/1904.08779).
|
Recognition](https://arxiv.org/abs/1904.08779).
|
||||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||||
@@ -132,7 +132,7 @@ class Wav2Vec2Config(PretrainedConfig):
|
|||||||
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
|
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
|
||||||
The temperature *kappa* in the contrastive loss.
|
The temperature *kappa* in the contrastive loss.
|
||||||
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probability for the output of the feature extractor that's used by the quantizer.
|
The dropout probability for the output of the feature encoder that's used by the quantizer.
|
||||||
num_negatives (`int`, *optional*, defaults to 100):
|
num_negatives (`int`, *optional*, defaults to 100):
|
||||||
Number of negative samples for the contrastive loss.
|
Number of negative samples for the contrastive loss.
|
||||||
codevector_dim (`int`, *optional*, defaults to 256):
|
codevector_dim (`int`, *optional*, defaults to 256):
|
||||||
|
|||||||
@@ -395,7 +395,7 @@ class FlaxConvLayersCollection(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
class FlaxWav2Vec2FeatureExtractor(nn.Module):
|
class FlaxWav2Vec2FeatureEncoder(nn.Module):
|
||||||
"""Construct the features from raw audio waveform"""
|
"""Construct the features from raw audio waveform"""
|
||||||
|
|
||||||
config: Wav2Vec2Config
|
config: Wav2Vec2Config
|
||||||
@@ -849,7 +849,7 @@ class FlaxWav2Vec2Module(nn.Module):
|
|||||||
dtype: jnp.dtype = jnp.float32
|
dtype: jnp.dtype = jnp.float32
|
||||||
|
|
||||||
def setup(self):
|
def setup(self):
|
||||||
self.feature_extractor = FlaxWav2Vec2FeatureExtractor(self.config, dtype=self.dtype)
|
self.feature_extractor = FlaxWav2Vec2FeatureEncoder(self.config, dtype=self.dtype)
|
||||||
self.feature_projection = FlaxWav2Vec2FeatureProjection(self.config, dtype=self.dtype)
|
self.feature_projection = FlaxWav2Vec2FeatureProjection(self.config, dtype=self.dtype)
|
||||||
self.masked_spec_embed = self.param(
|
self.masked_spec_embed = self.param(
|
||||||
"masked_spec_embed", jax.nn.initializers.uniform(), (self.config.hidden_size,)
|
"masked_spec_embed", jax.nn.initializers.uniform(), (self.config.hidden_size,)
|
||||||
|
|||||||
@@ -655,7 +655,7 @@ class TFWav2Vec2SamePadLayer(tf.keras.layers.Layer):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
class TFWav2Vec2FeatureExtractor(tf.keras.layers.Layer):
|
class TFWav2Vec2FeatureEncoder(tf.keras.layers.Layer):
|
||||||
def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None:
|
def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None:
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
|
|
||||||
@@ -682,6 +682,17 @@ class TFWav2Vec2FeatureExtractor(tf.keras.layers.Layer):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class TFWav2Vec2FeatureExtractor(TFWav2Vec2FeatureEncoder):
|
||||||
|
def __init__(self, config, **kwargs):
|
||||||
|
super().__init__(config, **kwargs)
|
||||||
|
warnings.warn(
|
||||||
|
f"The class `{self.__class__.__name__}` has been depreciated "
|
||||||
|
"and will be removed in Transformers v5. "
|
||||||
|
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TFWav2Vec2FeatureProjection(tf.keras.layers.Layer):
|
class TFWav2Vec2FeatureProjection(tf.keras.layers.Layer):
|
||||||
def __init__(self, config: Wav2Vec2Config, **kwargs):
|
def __init__(self, config: Wav2Vec2Config, **kwargs):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
@@ -1107,7 +1118,7 @@ class TFWav2Vec2MainLayer(tf.keras.layers.Layer):
|
|||||||
def __init__(self, config: Wav2Vec2Config, **kwargs):
|
def __init__(self, config: Wav2Vec2Config, **kwargs):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.feature_extractor = TFWav2Vec2FeatureExtractor(config, name="feature_extractor")
|
self.feature_extractor = TFWav2Vec2FeatureEncoder(config, name="feature_extractor")
|
||||||
self.feature_projection = TFWav2Vec2FeatureProjection(config, name="feature_projection")
|
self.feature_projection = TFWav2Vec2FeatureProjection(config, name="feature_projection")
|
||||||
|
|
||||||
if config.do_stable_layer_norm:
|
if config.do_stable_layer_norm:
|
||||||
@@ -1481,8 +1492,20 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.wav2vec2.feature_extractor.trainable = False
|
self.wav2vec2.feature_extractor.trainable = False
|
||||||
|
|
||||||
|
|||||||
@@ -431,7 +431,7 @@ class Wav2Vec2SamePadLayer(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
class Wav2Vec2FeatureExtractor(nn.Module):
|
class Wav2Vec2FeatureEncoder(nn.Module):
|
||||||
"""Construct the features from raw audio waveform"""
|
"""Construct the features from raw audio waveform"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@@ -484,6 +484,17 @@ class Wav2Vec2FeatureExtractor(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class Wav2Vec2FeatureExtractor(Wav2Vec2FeatureEncoder):
|
||||||
|
def __init__(self, config):
|
||||||
|
super().__init__(config)
|
||||||
|
warnings.warn(
|
||||||
|
f"The class `{self.__class__.__name__}` has been depreciated "
|
||||||
|
"and will be removed in Transformers v5. "
|
||||||
|
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class Wav2Vec2FeatureProjection(nn.Module):
|
class Wav2Vec2FeatureProjection(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
@@ -1125,7 +1136,7 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel):
|
|||||||
return attention_mask
|
return attention_mask
|
||||||
|
|
||||||
def _set_gradient_checkpointing(self, module, value=False):
|
def _set_gradient_checkpointing(self, module, value=False):
|
||||||
if isinstance(module, (Wav2Vec2Encoder, Wav2Vec2EncoderStableLayerNorm, Wav2Vec2FeatureExtractor)):
|
if isinstance(module, (Wav2Vec2Encoder, Wav2Vec2EncoderStableLayerNorm, Wav2Vec2FeatureEncoder)):
|
||||||
module.gradient_checkpointing = value
|
module.gradient_checkpointing = value
|
||||||
|
|
||||||
|
|
||||||
@@ -1194,7 +1205,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
|
|||||||
def __init__(self, config: Wav2Vec2Config):
|
def __init__(self, config: Wav2Vec2Config):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.feature_extractor = Wav2Vec2FeatureExtractor(config)
|
self.feature_extractor = Wav2Vec2FeatureEncoder(config)
|
||||||
self.feature_projection = Wav2Vec2FeatureProjection(config)
|
self.feature_projection = Wav2Vec2FeatureProjection(config)
|
||||||
|
|
||||||
# model only needs masking vector if mask prob is > 0.0
|
# model only needs masking vector if mask prob is > 0.0
|
||||||
@@ -1213,8 +1224,20 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.feature_extractor._freeze_parameters()
|
self.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1349,8 +1372,20 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.wav2vec2.feature_extractor._freeze_parameters()
|
self.wav2vec2.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1637,8 +1672,20 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.wav2vec2.feature_extractor._freeze_parameters()
|
self.wav2vec2.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1745,8 +1792,20 @@ class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.wav2vec2.feature_extractor._freeze_parameters()
|
self.wav2vec2.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1848,8 +1907,20 @@ class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.wav2vec2.feature_extractor._freeze_parameters()
|
self.wav2vec2.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1994,8 +2065,20 @@ class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.wav2vec2.feature_extractor._freeze_parameters()
|
self.wav2vec2.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
|
|||||||
@@ -64,24 +64,24 @@ class WavLMConfig(PretrainedConfig):
|
|||||||
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
||||||
The epsilon used by the layer normalization layers.
|
The epsilon used by the layer normalization layers.
|
||||||
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
feat_extract_norm (`str`, *optional*, defaults to `"group"`):
|
||||||
The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
|
The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
|
||||||
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
|
||||||
convolutional layers.
|
convolutional layers.
|
||||||
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
feat_proj_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probability for output of the feature extractor.
|
The dropout probability for output of the feature encoder.
|
||||||
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
|
||||||
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
The non-linear activation function (function or string) in the 1D convolutional layers of the feature
|
||||||
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||||
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probability for quantized feature extractor states.
|
The dropout probability for quantized feature encoder states.
|
||||||
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
|
conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
|
||||||
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
|
||||||
feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
|
feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
|
||||||
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
|
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
|
||||||
A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
|
A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
|
||||||
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
|
||||||
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
|
conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
|
||||||
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
|
A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
|
||||||
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
length of *conv_kernel* defines the number of convolutional layers and has to match the length of
|
||||||
*conv_dim*.
|
*conv_dim*.
|
||||||
conv_bias (`bool`, *optional*, defaults to `False`):
|
conv_bias (`bool`, *optional*, defaults to `False`):
|
||||||
@@ -96,7 +96,7 @@ class WavLMConfig(PretrainedConfig):
|
|||||||
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
|
True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
|
||||||
False` corresponds to applying layer norm after the attention layer.
|
False` corresponds to applying layer norm after the attention layer.
|
||||||
apply_spec_augment (`bool`, *optional*, defaults to `True`):
|
apply_spec_augment (`bool`, *optional*, defaults to `True`):
|
||||||
Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
|
Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
|
||||||
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
[SpecAugment: A Simple Data Augmentation Method for Automatic Speech
|
||||||
Recognition](https://arxiv.org/abs/1904.08779).
|
Recognition](https://arxiv.org/abs/1904.08779).
|
||||||
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
mask_time_prob (`float`, *optional*, defaults to 0.05):
|
||||||
@@ -122,7 +122,7 @@ class WavLMConfig(PretrainedConfig):
|
|||||||
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
|
contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
|
||||||
The temperature *kappa* in the contrastive loss.
|
The temperature *kappa* in the contrastive loss.
|
||||||
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
|
||||||
The dropout probability for the output of the feature extractor that's used by the quantizer.
|
The dropout probability for the output of the feature encoder that's used by the quantizer.
|
||||||
num_negatives (`int`, *optional*, defaults to 100):
|
num_negatives (`int`, *optional*, defaults to 100):
|
||||||
Number of negative samples for the contrastive loss.
|
Number of negative samples for the contrastive loss.
|
||||||
codevector_dim (`int`, *optional*, defaults to 256):
|
codevector_dim (`int`, *optional*, defaults to 256):
|
||||||
|
|||||||
@@ -15,6 +15,7 @@
|
|||||||
""" PyTorch WavLM model."""
|
""" PyTorch WavLM model."""
|
||||||
|
|
||||||
import math
|
import math
|
||||||
|
import warnings
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Optional, Tuple, Union
|
from typing import Optional, Tuple, Union
|
||||||
|
|
||||||
@@ -352,8 +353,8 @@ class WavLMSamePadLayer(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->WavLM
|
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->WavLM
|
||||||
class WavLMFeatureExtractor(nn.Module):
|
class WavLMFeatureEncoder(nn.Module):
|
||||||
"""Construct the features from raw audio waveform"""
|
"""Construct the features from raw audio waveform"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@@ -404,6 +405,17 @@ class WavLMFeatureExtractor(nn.Module):
|
|||||||
return hidden_states
|
return hidden_states
|
||||||
|
|
||||||
|
|
||||||
|
class WavLMFeatureExtractor(WavLMFeatureEncoder):
|
||||||
|
def __init__(self, config):
|
||||||
|
super().__init__(config)
|
||||||
|
warnings.warn(
|
||||||
|
f"The class `{self.__class__.__name__}` has been depreciated "
|
||||||
|
"and will be removed in Transformers v5. "
|
||||||
|
f"Use `{self.__class__.__bases__[0].__name__}` instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->WavLM
|
# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->WavLM
|
||||||
class WavLMFeatureProjection(nn.Module):
|
class WavLMFeatureProjection(nn.Module):
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
@@ -1077,7 +1089,7 @@ class WavLMPreTrainedModel(PreTrainedModel):
|
|||||||
return attention_mask
|
return attention_mask
|
||||||
|
|
||||||
def _set_gradient_checkpointing(self, module, value=False):
|
def _set_gradient_checkpointing(self, module, value=False):
|
||||||
if isinstance(module, (WavLMEncoder, WavLMEncoderStableLayerNorm, WavLMFeatureExtractor)):
|
if isinstance(module, (WavLMEncoder, WavLMEncoderStableLayerNorm, WavLMFeatureEncoder)):
|
||||||
module.gradient_checkpointing = value
|
module.gradient_checkpointing = value
|
||||||
|
|
||||||
|
|
||||||
@@ -1146,7 +1158,7 @@ class WavLMModel(WavLMPreTrainedModel):
|
|||||||
def __init__(self, config: WavLMConfig):
|
def __init__(self, config: WavLMConfig):
|
||||||
super().__init__(config)
|
super().__init__(config)
|
||||||
self.config = config
|
self.config = config
|
||||||
self.feature_extractor = WavLMFeatureExtractor(config)
|
self.feature_extractor = WavLMFeatureEncoder(config)
|
||||||
self.feature_projection = WavLMFeatureProjection(config)
|
self.feature_projection = WavLMFeatureProjection(config)
|
||||||
|
|
||||||
# model only needs masking vector if mask prob is > 0.0
|
# model only needs masking vector if mask prob is > 0.0
|
||||||
@@ -1165,8 +1177,20 @@ class WavLMModel(WavLMPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.feature_extractor._freeze_parameters()
|
self.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1303,8 +1327,20 @@ class WavLMForCTC(WavLMPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.wavlm.feature_extractor._freeze_parameters()
|
self.wavlm.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1412,8 +1448,20 @@ class WavLMForSequenceClassification(WavLMPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.wavlm.feature_extractor._freeze_parameters()
|
self.wavlm.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1516,8 +1564,20 @@ class WavLMForAudioFrameClassification(WavLMPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.wavlm.feature_extractor._freeze_parameters()
|
self.wavlm.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
@@ -1665,8 +1725,20 @@ class WavLMForXVector(WavLMPreTrainedModel):
|
|||||||
|
|
||||||
def freeze_feature_extractor(self):
|
def freeze_feature_extractor(self):
|
||||||
"""
|
"""
|
||||||
Calling this function will disable the gradient computation for the feature extractor so that its parameters
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
will not be updated during training.
|
not be updated during training.
|
||||||
|
"""
|
||||||
|
warnings.warn(
|
||||||
|
"The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5."
|
||||||
|
"Please use the equivalent `freeze_feature_encoder` method instead.",
|
||||||
|
FutureWarning,
|
||||||
|
)
|
||||||
|
self.freeze_feature_encoder()
|
||||||
|
|
||||||
|
def freeze_feature_encoder(self):
|
||||||
|
"""
|
||||||
|
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
|
||||||
|
not be updated during training.
|
||||||
"""
|
"""
|
||||||
self.wavlm.feature_extractor._freeze_parameters()
|
self.wavlm.feature_extractor._freeze_parameters()
|
||||||
|
|
||||||
|
|||||||
@@ -225,7 +225,7 @@ class HubertModelTester:
|
|||||||
model.train()
|
model.train()
|
||||||
|
|
||||||
# freeze feature encoder
|
# freeze feature encoder
|
||||||
model.freeze_feature_extractor()
|
model.freeze_feature_encoder()
|
||||||
|
|
||||||
input_values = input_values[:3]
|
input_values = input_values[:3]
|
||||||
|
|
||||||
|
|||||||
@@ -203,7 +203,7 @@ class SEWModelTester:
|
|||||||
model.train()
|
model.train()
|
||||||
|
|
||||||
# freeze feature encoder
|
# freeze feature encoder
|
||||||
model.freeze_feature_extractor()
|
model.freeze_feature_encoder()
|
||||||
|
|
||||||
input_values = input_values[:3]
|
input_values = input_values[:3]
|
||||||
|
|
||||||
|
|||||||
@@ -224,7 +224,7 @@ class SEWDModelTester:
|
|||||||
model.train()
|
model.train()
|
||||||
|
|
||||||
# freeze feature encoder
|
# freeze feature encoder
|
||||||
model.freeze_feature_extractor()
|
model.freeze_feature_encoder()
|
||||||
|
|
||||||
input_values = input_values[:3]
|
input_values = input_values[:3]
|
||||||
|
|
||||||
|
|||||||
@@ -184,7 +184,7 @@ class TFHubertModelTester:
|
|||||||
model = TFHubertForCTC(config)
|
model = TFHubertForCTC(config)
|
||||||
|
|
||||||
# freeze feature encoder
|
# freeze feature encoder
|
||||||
model.freeze_feature_extractor()
|
model.freeze_feature_encoder()
|
||||||
|
|
||||||
input_values = input_values[:3]
|
input_values = input_values[:3]
|
||||||
|
|
||||||
|
|||||||
@@ -194,7 +194,7 @@ class TFWav2Vec2ModelTester:
|
|||||||
model = TFWav2Vec2ForCTC(config)
|
model = TFWav2Vec2ForCTC(config)
|
||||||
|
|
||||||
# freeze feature encoder
|
# freeze feature encoder
|
||||||
model.freeze_feature_extractor()
|
model.freeze_feature_encoder()
|
||||||
|
|
||||||
input_values = input_values[:3]
|
input_values = input_values[:3]
|
||||||
|
|
||||||
|
|||||||
@@ -226,7 +226,7 @@ class UniSpeechModelTester:
|
|||||||
model.train()
|
model.train()
|
||||||
|
|
||||||
# freeze feature encoder
|
# freeze feature encoder
|
||||||
model.freeze_feature_extractor()
|
model.freeze_feature_encoder()
|
||||||
|
|
||||||
input_values = input_values[:3]
|
input_values = input_values[:3]
|
||||||
|
|
||||||
|
|||||||
@@ -246,7 +246,7 @@ class UniSpeechSatModelTester:
|
|||||||
model.train()
|
model.train()
|
||||||
|
|
||||||
# freeze feature encoder
|
# freeze feature encoder
|
||||||
model.freeze_feature_extractor()
|
model.freeze_feature_encoder()
|
||||||
|
|
||||||
input_values = input_values[:3]
|
input_values = input_values[:3]
|
||||||
|
|
||||||
|
|||||||
@@ -300,7 +300,7 @@ class Wav2Vec2ModelTester:
|
|||||||
model.train()
|
model.train()
|
||||||
|
|
||||||
# freeze feature encoder
|
# freeze feature encoder
|
||||||
model.freeze_feature_extractor()
|
model.freeze_feature_encoder()
|
||||||
|
|
||||||
input_values = input_values[:3]
|
input_values = input_values[:3]
|
||||||
|
|
||||||
|
|||||||
@@ -238,7 +238,7 @@ class WavLMModelTester:
|
|||||||
model.train()
|
model.train()
|
||||||
|
|
||||||
# freeze feature encoder
|
# freeze feature encoder
|
||||||
model.freeze_feature_extractor()
|
model.freeze_feature_encoder()
|
||||||
|
|
||||||
input_values = input_values[:3]
|
input_values = input_values[:3]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user