From 600496fa509e5cc7245e0594e60142b9ab565c73 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Tue, 28 Dec 2021 20:33:23 +0100 Subject: [PATCH] [Wav2Vec2] Rename model's feature extractor to feature encoder (#14959) * rename classes * clean up more namings * remove bogus file * Apply suggestions from code review * Apply suggestions from code review * replace more names * more regex replace * make style * correct * correct more * make style * finish * correct more in wav2vec2 * make style * improve freeze_extractor * add aliases * add tf aliases --- .../run_audio_classification.py | 43 +++++-- examples/pytorch/speech-recognition/README.md | 8 +- .../run_speech_recognition_ctc.py | 56 +++++---- .../run_speech_recognition_seq2seq.py | 26 ++-- .../models/hubert/configuration_hubert.py | 14 +-- .../models/hubert/modeling_hubert.py | 50 ++++++-- .../models/hubert/modeling_tf_hubert.py | 31 ++++- .../models/sew/configuration_sew.py | 12 +- src/transformers/models/sew/modeling_sew.py | 52 ++++++-- .../models/sew_d/configuration_sew_d.py | 14 +-- .../models/sew_d/modeling_sew_d.py | 50 ++++++-- .../modeling_speech_encoder_decoder.py | 6 +- .../unispeech/configuration_unispeech.py | 16 +-- .../models/unispeech/modeling_unispeech.py | 68 +++++++++-- .../configuration_unispeech_sat.py | 16 +-- ..._original_pytorch_checkpoint_to_pytorch.py | 7 +- .../unispeech_sat/modeling_unispeech_sat.py | 106 +++++++++++++--- .../models/wav2vec2/configuration_wav2vec2.py | 16 +-- .../models/wav2vec2/modeling_flax_wav2vec2.py | 4 +- .../models/wav2vec2/modeling_tf_wav2vec2.py | 31 ++++- .../models/wav2vec2/modeling_wav2vec2.py | 113 +++++++++++++++--- .../models/wavlm/configuration_wavlm.py | 16 +-- .../models/wavlm/modeling_wavlm.py | 100 +++++++++++++--- tests/test_modeling_hubert.py | 2 +- tests/test_modeling_sew.py | 2 +- tests/test_modeling_sew_d.py | 2 +- tests/test_modeling_tf_hubert.py | 2 +- tests/test_modeling_tf_wav2vec2.py | 2 +- tests/test_modeling_unispeech.py | 2 +- 
tests/test_modeling_unispeech_sat.py | 2 +- tests/test_modeling_wav2vec2.py | 2 +- tests/test_modeling_wavlm.py | 2 +- 32 files changed, 658 insertions(+), 215 deletions(-) diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index 8b9b72dc61..9dcb98d456 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -17,6 +17,7 @@ import logging import os import sys +import warnings from dataclasses import dataclass, field from random import randint from typing import Optional @@ -76,24 +77,24 @@ class DataTrainingArguments: eval_file: Optional[str] = field( default=None, metadata={"help": "A file containing the validation audio paths and labels."} ) - train_split_name: Optional[str] = field( + train_split_name: str = field( default="train", metadata={ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'" }, ) - eval_split_name: Optional[str] = field( + eval_split_name: str = field( default="validation", metadata={ "help": "The name of the training data set split to use (via the datasets library). Defaults to " "'validation'" }, ) - audio_column_name: Optional[str] = field( + audio_column_name: str = field( default="audio", metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"}, ) - label_column_name: Optional[str] = field( + label_column_name: str = field( default="label", metadata={"help": "The name of the dataset column containing the labels. Defaults to 'label'"} ) max_train_samples: Optional[int] = field( @@ -110,7 +111,7 @@ class DataTrainingArguments: "value if set." 
}, ) - max_length_seconds: Optional[float] = field( + max_length_seconds: float = field( default=20, metadata={"help": "Audio clips will be randomly cut to this length during training if the value is set."}, ) @@ -136,11 +137,13 @@ class ModelArguments: default="main", metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, ) - feature_extractor_name: str = field(default=None, metadata={"help": "Name or path of preprocessor config."}) - freeze_feature_extractor: Optional[bool] = field( - default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."} + feature_extractor_name: Optional[str] = field( + default=None, metadata={"help": "Name or path of preprocessor config."} ) - attention_mask: Optional[bool] = field( + freeze_feature_encoder: bool = field( + default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."} + ) + attention_mask: bool = field( default=True, metadata={"help": "Whether to generate an attention mask in the feature extractor."} ) use_auth_token: bool = field( @@ -150,6 +153,24 @@ class ModelArguments: "with private models)." }, ) + freeze_feature_extractor: Optional[bool] = field( + default=None, metadata={"help": "Whether to freeze the feature extractor layers of the model."} + ) + + def __post_init__(self): + if not self.freeze_feature_extractor and self.freeze_feature_encoder: + warnings.warn( + "The argument `--freeze_feature_extractor` is deprecated and " + "will be removed in a future version. Use `--freeze_feature_encoder`" + "instead. Setting `freeze_feature_encoder==True`.", + FutureWarning, + ) + if self.freeze_feature_extractor and not self.freeze_feature_encoder: + raise ValueError( + "The argument `--freeze_feature_extractor` is deprecated and " + "should not be used in combination with `--freeze_feature_encoder`." + "Only make use of `--freeze_feature_encoder`." 
+ ) def main(): @@ -302,8 +323,8 @@ def main(): ) # freeze the convolutional waveform encoder - if model_args.freeze_feature_extractor: - model.freeze_feature_extractor() + if model_args.freeze_feature_encoder: + model.freeze_feature_encoder() if training_args.do_train: if data_args.max_train_samples is not None: diff --git a/examples/pytorch/speech-recognition/README.md b/examples/pytorch/speech-recognition/README.md index fdc29139a8..8eeb8efc87 100644 --- a/examples/pytorch/speech-recognition/README.md +++ b/examples/pytorch/speech-recognition/README.md @@ -78,7 +78,7 @@ python run_speech_recognition_ctc.py \ --eval_steps="100" \ --layerdrop="0.0" \ --save_total_limit="3" \ - --freeze_feature_extractor \ + --freeze_feature_encoder \ --gradient_checkpointing \ --chars_to_ignore , ? . ! - \; \: \" “ % ‘ ” � \ --fp16 \ @@ -113,7 +113,7 @@ python -m torch.distributed.launch \ --logging_steps="1" \ --layerdrop="0.0" \ --save_total_limit="3" \ - --freeze_feature_extractor \ + --freeze_feature_encoder \ --gradient_checkpointing \ --chars_to_ignore , ? . ! 
- \; \: \" “ % ‘ ” � \ --fp16 \ @@ -304,7 +304,7 @@ python run_speech_recognition_seq2seq.py \ --eval_steps="400" \ --logging_steps="10" \ --save_total_limit="1" \ - --freeze_feature_extractor \ + --freeze_feature_encoder \ --gradient_checkpointing \ --fp16 \ --group_by_length \ @@ -346,7 +346,7 @@ python -m torch.distributed.launch \ --eval_steps="400" \ --logging_steps="10" \ --save_total_limit="1" \ - --freeze_feature_extractor \ + --freeze_feature_encoder \ --gradient_checkpointing \ --fp16 \ --group_by_length \ diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 98a4f1374a..1784b613fa 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -78,29 +78,27 @@ class ModelArguments: default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, ) - freeze_feature_extractor: Optional[bool] = field( - default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."} + freeze_feature_encoder: bool = field( + default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."} ) - attention_dropout: Optional[float] = field( + attention_dropout: float = field( default=0.0, metadata={"help": "The dropout ratio for the attention probabilities."} ) - activation_dropout: Optional[float] = field( + activation_dropout: float = field( default=0.0, metadata={"help": "The dropout ratio for activations inside the fully connected layer."} ) - feat_proj_dropout: Optional[float] = field( - default=0.0, metadata={"help": "The dropout ratio for the projected features."} - ) - hidden_dropout: Optional[float] = field( + feat_proj_dropout: float = field(default=0.0, metadata={"help": "The dropout ratio for the projected features."}) + hidden_dropout: float = field( default=0.0, 
metadata={ "help": "The dropout probability for all fully connected layers in the embeddings, encoder, and pooler." }, ) - final_dropout: Optional[float] = field( + final_dropout: float = field( default=0.0, metadata={"help": "The dropout probability for the final projection layer."}, ) - mask_time_prob: Optional[float] = field( + mask_time_prob: float = field( default=0.05, metadata={ "help": "Probability of each feature vector along the time axis to be chosen as the start of the vector" @@ -108,22 +106,22 @@ class ModelArguments: "vectors will be masked along the time axis." }, ) - mask_time_length: Optional[int] = field( + mask_time_length: int = field( default=10, metadata={"help": "Length of vector span to mask along the time axis."}, ) - mask_feature_prob: Optional[float] = field( + mask_feature_prob: float = field( default=0.0, metadata={ "help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector" "span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis." }, ) - mask_feature_length: Optional[int] = field( + mask_feature_length: int = field( default=10, metadata={"help": "Length of vector span to mask along the feature axis."}, ) - layerdrop: Optional[float] = field(default=0.0, metadata={"help": "The LayerDrop probability."}) + layerdrop: float = field(default=0.0, metadata={"help": "The LayerDrop probability."}) ctc_loss_reduction: Optional[str] = field( default="mean", metadata={"help": "The way the ctc loss should be reduced. 
Should be one of 'mean' or 'sum'."} ) @@ -142,26 +140,26 @@ class DataTrainingArguments: dataset_name: str = field( metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} ) - dataset_config_name: Optional[str] = field( + dataset_config_name: str = field( default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} ) - train_split_name: Optional[str] = field( + train_split_name: str = field( default="train+validation", metadata={ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'" }, ) - eval_split_name: Optional[str] = field( + eval_split_name: str = field( default="test", metadata={ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'" }, ) - audio_column_name: Optional[str] = field( + audio_column_name: str = field( default="audio", metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"}, ) - text_column_name: Optional[str] = field( + text_column_name: str = field( default="text", metadata={"help": "The name of the dataset column containing the text data. Defaults to 'text'"}, ) @@ -190,20 +188,20 @@ class DataTrainingArguments: default=None, metadata={"help": "A list of characters to remove from the transcripts."}, ) - eval_metrics: Optional[List[str]] = list_field( + eval_metrics: List[str] = list_field( default=["wer"], metadata={"help": "A list of metrics the model should be evaluated on. E.g. 
`'wer cer'`"}, ) - max_duration_in_seconds: Optional[float] = field( + max_duration_in_seconds: float = field( default=20.0, metadata={ "help": "Filter audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`" }, ) - min_duration_in_seconds: Optional[float] = field( + min_duration_in_seconds: float = field( default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"} ) - preprocessing_only: Optional[bool] = field( + preprocessing_only: bool = field( default=False, metadata={ "help": "Whether to only do data preprocessing and skip training. " @@ -212,22 +210,22 @@ class DataTrainingArguments: "so that the cached datasets can consequently be loaded in distributed training" }, ) - use_auth_token: Optional[bool] = field( + use_auth_token: bool = field( default=False, metadata={ "help": "If :obj:`True`, will use the token generated when running" ":obj:`transformers-cli login` as HTTP bearer authorization for remote files." }, ) - unk_token: Optional[str] = field( + unk_token: str = field( default="[UNK]", metadata={"help": "The unk token for the tokenizer"}, ) - pad_token: Optional[str] = field( + pad_token: str = field( default="[PAD]", metadata={"help": "The padding token for the tokenizer"}, ) - word_delimiter_token: Optional[str] = field( + word_delimiter_token: str = field( default="|", metadata={"help": "The word delimiter token for the tokenizer"}, ) @@ -545,8 +543,8 @@ def main(): ) # freeze encoder - if model_args.freeze_feature_extractor: - model.freeze_feature_extractor() + if model_args.freeze_feature_encoder: + model.freeze_feature_encoder() # 6. 
Now we preprocess the datasets including loading the audio, resampling and normalization # Thankfully, `datasets` takes care of automatically loading and resampling the audio, diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index f2a3b1ee4a..89efdca6a6 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -91,8 +91,8 @@ class ModelArguments: "with private models)." }, ) - freeze_feature_extractor: Optional[bool] = field( - default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."} + freeze_feature_encoder: bool = field( + default=True, metadata={"help": "Whether to freeze the feature encoder layers of the model."} ) @@ -102,7 +102,7 @@ class DataTrainingArguments: Arguments pertaining to what data we are going to input our model for training and eval. """ - dataset_name: Optional[str] = field( + dataset_name: str = field( default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} ) dataset_config_name: Optional[str] = field( @@ -133,24 +133,24 @@ class DataTrainingArguments: "value if set." }, ) - audio_column_name: Optional[str] = field( + audio_column_name: str = field( default="audio", metadata={"help": "The name of the dataset column containing the audio data. Defaults to 'audio'"}, ) - text_column_name: Optional[str] = field( + text_column_name: str = field( default="text", metadata={"help": "The name of the dataset column containing the text data. 
Defaults to 'text'"}, ) - max_duration_in_seconds: Optional[float] = field( + max_duration_in_seconds: float = field( default=20.0, metadata={ "help": "Truncate audio files that are longer than `max_duration_in_seconds` seconds to 'max_duration_in_seconds`" }, ) - min_duration_in_seconds: Optional[float] = field( + min_duration_in_seconds: float = field( default=0.0, metadata={"help": "Filter audio files that are shorter than `min_duration_in_seconds` seconds"} ) - preprocessing_only: Optional[bool] = field( + preprocessing_only: bool = field( default=False, metadata={ "help": "Whether to only do data preprocessing and skip training. " @@ -159,19 +159,19 @@ class DataTrainingArguments: "so that the cached datasets can consequently be loaded in distributed training" }, ) - train_split_name: Optional[str] = field( + train_split_name: str = field( default="train", metadata={ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'" }, ) - eval_split_name: Optional[str] = field( + eval_split_name: str = field( default="test", metadata={ "help": "The name of the training data set split to use (via the datasets library). Defaults to 'train'" }, ) - do_lower_case: Optional[bool] = field( + do_lower_case: bool = field( default=True, metadata={"help": "Whether the target text should be lower cased."}, ) @@ -335,8 +335,8 @@ def main(): if model.config.decoder_start_token_id is None: raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") - if model_args.freeze_feature_extractor: - model.freeze_feature_extractor() + if model_args.freeze_feature_encoder: + model.freeze_feature_encoder() # 6. 
Resample speech dataset if necassary dataset_sampling_rate = next(iter(raw_datasets.values())).features[data_args.audio_column_name].sampling_rate diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py index 9d481a7555..703d0eddfb 100644 --- a/src/transformers/models/hubert/configuration_hubert.py +++ b/src/transformers/models/hubert/configuration_hubert.py @@ -64,24 +64,24 @@ class HubertConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. feat_extract_norm (`str`, *optional*, defaults to `"group"`): - The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group + The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D convolutional layers. feat_proj_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability for output of the feature extractor. + The dropout probability for output of the feature encoder. feat_proj_layer_norm (`bool`, *optional*, defaults to `True`): - Whether to apply LayerNorm to the output of the feature extractor. + Whether to apply LayerNorm to the output of the feature encoder. feat_extract_activation (`str, `optional`, defaults to `"gelu"`): The non-linear activation function (function or string) in the 1D convolutional layers of the feature extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the - feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers. + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. 
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): - A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length + A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*. conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): - A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The + A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The length of *conv_kernel* defines the number of convolutional layers and has to match the the length of *conv_dim*. conv_bias (`bool`, *optional*, defaults to `False`): @@ -96,7 +96,7 @@ class HubertConfig(PretrainedConfig): True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer. apply_spec_augment (`bool`, *optional*, defaults to `True`): - Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see + Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779). mask_time_prob (`float`, *optional*, defaults to 0.05): diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 0f955d734c..c317bf3247 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -14,6 +14,7 @@ # limitations under the License. 
""" PyTorch Hubert model.""" +import warnings from typing import Optional, Tuple, Union import numpy as np @@ -284,8 +285,8 @@ class HubertSamePadLayer(nn.Module): return hidden_states -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->Hubert -class HubertFeatureExtractor(nn.Module): +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->Hubert +class HubertFeatureEncoder(nn.Module): """Construct the features from raw audio waveform""" def __init__(self, config): @@ -336,6 +337,17 @@ class HubertFeatureExtractor(nn.Module): return hidden_states +class HubertFeatureExtractor(HubertFeatureEncoder): + def __init__(self, config): + super().__init__(config) + warnings.warn( + f"The class `{self.__class__.__name__}` has been depreciated " + "and will be removed in Transformers v5. " + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + class HubertFeatureProjection(nn.Module): def __init__(self, config): super().__init__() @@ -902,7 +914,7 @@ class HubertModel(HubertPreTrainedModel): def __init__(self, config: HubertConfig): super().__init__(config) self.config = config - self.feature_extractor = HubertFeatureExtractor(config) + self.feature_extractor = HubertFeatureEncoder(config) self.feature_projection = HubertFeatureProjection(config) if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: @@ -1063,8 +1075,20 @@ class HubertForCTC(HubertPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameter - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." 
+ "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. """ self.hubert.feature_extractor._freeze_parameters() @@ -1172,8 +1196,20 @@ class HubertForSequenceClassification(HubertPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. 
""" self.hubert.feature_extractor._freeze_parameters() diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py index 8cbd2c6fdf..79ae4ad934 100644 --- a/src/transformers/models/hubert/modeling_tf_hubert.py +++ b/src/transformers/models/hubert/modeling_tf_hubert.py @@ -659,7 +659,7 @@ class TFHubertSamePadLayer(tf.keras.layers.Layer): return hidden_states -class TFHubertFeatureExtractor(tf.keras.layers.Layer): +class TFHubertFeatureEncoder(tf.keras.layers.Layer): def __init__(self, config: HubertConfig, **kwargs: Any) -> None: super().__init__(**kwargs) @@ -686,6 +686,17 @@ class TFHubertFeatureExtractor(tf.keras.layers.Layer): return hidden_states +class TFHubertFeatureExtractor(TFHubertFeatureEncoder): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + warnings.warn( + f"The class `{self.__class__.__name__}` has been depreciated " + "and will be removed in Transformers v5. " + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + class TFHubertFeatureProjection(tf.keras.layers.Layer): def __init__(self, config: HubertConfig, **kwargs): super().__init__(**kwargs) @@ -1116,7 +1127,7 @@ class TFHubertMainLayer(tf.keras.layers.Layer): def __init__(self, config: HubertConfig, **kwargs): super().__init__(**kwargs) self.config = config - self.feature_extractor = TFHubertFeatureExtractor(config, name="feature_extractor") + self.feature_extractor = TFHubertFeatureEncoder(config, name="feature_extractor") self.feature_projection = TFHubertFeatureProjection(config, name="feature_projection") if config.do_stable_layer_norm: @@ -1490,8 +1501,20 @@ class TFHubertForCTC(TFHubertPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameter - will not be updated during training. 
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. """ self.hubert.feature_extractor.trainable = False diff --git a/src/transformers/models/sew/configuration_sew.py b/src/transformers/models/sew/configuration_sew.py index 9b05e23aa9..9d0953ffc1 100644 --- a/src/transformers/models/sew/configuration_sew.py +++ b/src/transformers/models/sew/configuration_sew.py @@ -65,22 +65,22 @@ class SEWConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. feat_extract_norm (`str`, *optional*, defaults to `"group"`): - The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group + The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D convolutional layers. feat_proj_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability for output of the feature extractor. + The dropout probability for output of the feature encoder. feat_extract_activation (`str, `optional`, defaults to `"gelu"`): The non-linear activation function (function or string) in the 1D convolutional layers of the feature extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. 
conv_dim (`Tuple[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`): A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the - feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers. + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`): - A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length + A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*. conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`): - A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The + A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The length of *conv_kernel* defines the number of convolutional layers and has to match the the length of *conv_dim*. conv_bias (`bool`, *optional*, defaults to `False`): @@ -91,7 +91,7 @@ class SEWConfig(PretrainedConfig): num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): Number of groups of 1D convolutional positional embeddings layer. apply_spec_augment (`bool`, *optional*, defaults to `True`): - Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see + Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779). 
mask_time_prob (`float`, *optional*, defaults to 0.05): diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index 6b6a8b83a2..ae283baf39 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -15,6 +15,7 @@ """ PyTorch SEW model.""" import math +import warnings from typing import Optional, Tuple, Union import numpy as np @@ -301,8 +302,8 @@ class SEWUpsampling(nn.Module): return hidden_states -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->SEW -class SEWFeatureExtractor(nn.Module): +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->SEW +class SEWFeatureEncoder(nn.Module): """Construct the features from raw audio waveform""" def __init__(self, config): @@ -353,6 +354,17 @@ class SEWFeatureExtractor(nn.Module): return hidden_states +class SEWFeatureExtractor(SEWFeatureEncoder): + def __init__(self, config): + super().__init__(config) + warnings.warn( + f"The class `{self.__class__.__name__}` has been deprecated " + "and will be removed in Transformers v5. 
" + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + # Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->SEW class SEWAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -712,7 +724,7 @@ class SEWPreTrainedModel(PreTrainedModel): module.bias.data.zero_() def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (SEWEncoder, SEWFeatureExtractor)): + if isinstance(module, (SEWEncoder, SEWFeatureEncoder)): module.gradient_checkpointing = value def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]): @@ -797,7 +809,7 @@ class SEWModel(SEWPreTrainedModel): def __init__(self, config: SEWConfig): super().__init__(config) self.config = config - self.feature_extractor = SEWFeatureExtractor(config) + self.feature_extractor = SEWFeatureEncoder(config) self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps) self.project_features = config.conv_dim[-1] != config.hidden_size @@ -943,8 +955,20 @@ class SEWForCTC(SEWPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameter - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. 
""" self.sew.feature_extractor._freeze_parameters() @@ -1052,8 +1076,20 @@ class SEWForSequenceClassification(SEWPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. """ self.sew.feature_extractor._freeze_parameters() diff --git a/src/transformers/models/sew_d/configuration_sew_d.py b/src/transformers/models/sew_d/configuration_sew_d.py index eb5d9d99c1..d808da2d1a 100644 --- a/src/transformers/models/sew_d/configuration_sew_d.py +++ b/src/transformers/models/sew_d/configuration_sew_d.py @@ -81,24 +81,24 @@ class SEWDConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-7): The epsilon used by the layer normalization layers in the transformer encoder. feature_layer_norm_eps (`float`, *optional*, defaults to 1e-5): - The epsilon used by the layer normalization after the feature extractor. + The epsilon used by the layer normalization after the feature encoder. feat_extract_norm (`str`, *optional*, defaults to `"group"`): - The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group + The norm to be applied to 1D convolutional layers in feature encoder. 
One of `"group"` for group normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D convolutional layers. feat_proj_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability for output of the feature extractor. + The dropout probability for output of the feature encoder. feat_extract_activation (`str, `optional`, defaults to `"gelu"`): The non-linear activation function (function or string) in the 1D convolutional layers of the feature extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. conv_dim (`Tuple[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`): A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the - feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers. + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`): - A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length + A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*. conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`): - A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The + A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The length of *conv_kernel* defines the number of convolutional layers and has to match the the length of *conv_dim*. 
conv_bias (`bool`, *optional*, defaults to `False`): @@ -109,7 +109,7 @@ class SEWDConfig(PretrainedConfig): num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): Number of groups of 1D convolutional positional embeddings layer. apply_spec_augment (`bool`, *optional*, defaults to `True`): - Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see + Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779). mask_time_prob (`float`, *optional*, defaults to 0.05): diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 636766663e..be02a12244 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -15,6 +15,7 @@ """ PyTorch SEW model.""" import math +import warnings from collections.abc import Sequence from typing import Optional, Tuple, Union @@ -387,8 +388,8 @@ class SEWDUpsampling(nn.Module): return hidden_states -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->SEWD -class SEWDFeatureExtractor(nn.Module): +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->SEWD +class SEWDFeatureEncoder(nn.Module): """Construct the features from raw audio waveform""" def __init__(self, config): @@ -439,6 +440,17 @@ class SEWDFeatureExtractor(nn.Module): return hidden_states +class SEWDFeatureExtractor(SEWDFeatureEncoder): + def __init__(self, config): + super().__init__(config) + warnings.warn( + f"The class `{self.__class__.__name__}` has been deprecated " + "and will be removed in Transformers v5. 
" + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + # Copied from transformers.models.deberta.modeling_deberta.ContextPooler class ContextPooler(nn.Module): def __init__(self, config): @@ -1333,7 +1345,7 @@ class SEWDModel(SEWDPreTrainedModel): def __init__(self, config: SEWDConfig): super().__init__(config) self.config = config - self.feature_extractor = SEWDFeatureExtractor(config) + self.feature_extractor = SEWDFeatureEncoder(config) self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.feature_layer_norm_eps) self.project_features = config.conv_dim[-1] != config.hidden_size @@ -1479,8 +1491,20 @@ class SEWDForCTC(SEWDPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameter - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. """ self.sew_d.feature_extractor._freeze_parameters() @@ -1588,8 +1612,20 @@ class SEWDForSequenceClassification(SEWDPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. 
+ """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. """ self.sew_d.feature_extractor._freeze_parameters() diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index b632c020ec..510c22aa69 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -265,12 +265,12 @@ class SpeechEncoderDecoderModel(PreTrainedModel): def set_output_embeddings(self, new_embeddings): return self.decoder.set_output_embeddings(new_embeddings) - def freeze_feature_extractor(self): + def freeze_feature_encoder(self): """ - Calling this function will disable the gradient computation for the feature extractor of the speech encoder so + Calling this function will disable the gradient computation for the feature encoder of the speech encoder so that its parameters will not be updated during training. 
""" - self.encoder.freeze_feature_extractor() + self.encoder.freeze_feature_encoder() @classmethod def from_pretrained(cls, *args, **kwargs): diff --git a/src/transformers/models/unispeech/configuration_unispeech.py b/src/transformers/models/unispeech/configuration_unispeech.py index 9d2f2e025e..996a27fa92 100644 --- a/src/transformers/models/unispeech/configuration_unispeech.py +++ b/src/transformers/models/unispeech/configuration_unispeech.py @@ -65,24 +65,24 @@ class UniSpeechConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. feat_extract_norm (`str`, *optional*, defaults to `"group"`): - The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group + The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D convolutional layers. feat_proj_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability for output of the feature extractor. + The dropout probability for output of the feature encoder. feat_extract_activation (`str, `optional`, defaults to `"gelu"`): The non-linear activation function (function or string) in the 1D convolutional layers of the feature extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for quantized feature extractor states. + The dropout probabilitiy for quantized feature encoder states. conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the - feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers. + feature encoder. 
The length of *conv_dim* defines the number of 1D convolutional layers. conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): - A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length + A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*. conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): - A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The + A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The length of *conv_kernel* defines the number of convolutional layers and has to match the the length of *conv_dim*. conv_bias (`bool`, *optional*, defaults to `False`): @@ -97,7 +97,7 @@ class UniSpeechConfig(PretrainedConfig): True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer. apply_spec_augment (`bool`, *optional*, defaults to `True`): - Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see + Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779). mask_time_prob (`float`, *optional*, defaults to 0.05): @@ -132,7 +132,7 @@ class UniSpeechConfig(PretrainedConfig): contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): The temperature *kappa* in the contrastive loss. feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for the output of the feature extractor that's used by the quantizer. 
+ The dropout probabilitiy for the output of the feature encoder that's used by the quantizer. num_negatives (`int`, *optional*, defaults to 100): Number of negative samples for the contrastive loss. codevector_dim (`int`, *optional*, defaults to 256): diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 124b3be1d3..5dfd73a3d3 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -15,6 +15,7 @@ """ PyTorch UniSpeech model.""" import math +import warnings from dataclasses import dataclass from typing import Optional, Tuple, Union @@ -351,8 +352,8 @@ class UniSpeechSamePadLayer(nn.Module): return hidden_states -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->UniSpeech -class UniSpeechFeatureExtractor(nn.Module): +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->UniSpeech +class UniSpeechFeatureEncoder(nn.Module): """Construct the features from raw audio waveform""" def __init__(self, config): @@ -406,6 +407,17 @@ class UniSpeechFeatureExtractor(nn.Module): return hidden_states +class UniSpeechFeatureExtractor(UniSpeechFeatureEncoder): + def __init__(self, config): + super().__init__(config) + warnings.warn( + f"The class `{self.__class__.__name__}` has been deprecated " + "and will be removed in Transformers v5. 
" + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->UniSpeech class UniSpeechFeatureProjection(nn.Module): def __init__(self, config): @@ -980,7 +992,7 @@ class UniSpeechPreTrainedModel(PreTrainedModel): return attention_mask def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (UniSpeechEncoder, UniSpeechEncoderStableLayerNorm, UniSpeechFeatureExtractor)): + if isinstance(module, (UniSpeechEncoder, UniSpeechEncoderStableLayerNorm, UniSpeechFeatureEncoder)): module.gradient_checkpointing = value @@ -1049,7 +1061,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel): def __init__(self, config: UniSpeechConfig): super().__init__(config) self.config = config - self.feature_extractor = UniSpeechFeatureExtractor(config) + self.feature_extractor = UniSpeechFeatureEncoder(config) self.feature_projection = UniSpeechFeatureProjection(config) if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0: @@ -1193,8 +1205,20 @@ class UniSpeechForPreTraining(UniSpeechPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. 
""" self.unispeech.feature_extractor._freeze_parameters() @@ -1358,8 +1382,20 @@ class UniSpeechForCTC(UniSpeechPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameter - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. """ self.unispeech.feature_extractor._freeze_parameters() @@ -1467,8 +1503,20 @@ class UniSpeechForSequenceClassification(UniSpeechPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. 
""" self.unispeech.feature_extractor._freeze_parameters() diff --git a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py index d58e8fcb12..d13260dcff 100644 --- a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py @@ -65,24 +65,24 @@ class UniSpeechSatConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. feat_extract_norm (`str`, *optional*, defaults to `"group"`): - The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group + The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D convolutional layers. feat_proj_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability for output of the feature extractor. + The dropout probability for output of the feature encoder. feat_extract_activation (`str, `optional`, defaults to `"gelu"`): The non-linear activation function (function or string) in the 1D convolutional layers of the feature extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for quantized feature extractor states. + The dropout probabilitiy for quantized feature encoder states. conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the - feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers. + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. 
conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): - A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length + A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*. conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): - A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The + A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The length of *conv_kernel* defines the number of convolutional layers and has to match the the length of *conv_dim*. conv_bias (`bool`, *optional*, defaults to `False`): @@ -97,7 +97,7 @@ class UniSpeechSatConfig(PretrainedConfig): True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer. apply_spec_augment (`bool`, *optional*, defaults to `True`): - Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see + Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779). mask_time_prob (`float`, *optional*, defaults to 0.05): @@ -132,7 +132,7 @@ class UniSpeechSatConfig(PretrainedConfig): contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): The temperature *kappa* in the contrastive loss. feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for the output of the feature extractor that's used by the quantizer. + The dropout probabilitiy for the output of the feature encoder that's used by the quantizer. 
num_negatives (`int`, *optional*, defaults to 100): Number of negative samples for the contrastive loss. codevector_dim (`int`, *optional*, defaults to 256): diff --git a/src/transformers/models/unispeech_sat/convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/unispeech_sat/convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py index fc43b81380..37e5011c7b 100644 --- a/src/transformers/models/unispeech_sat/convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/unispeech_sat/convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py @@ -20,12 +20,7 @@ import argparse import fairseq import torch -from transformers import ( # UniSpeechSatCTCTokenizer,; UniSpeechSatFeatureExtractor,; UniSpeechSatProcessor, - UniSpeechSatConfig, - UniSpeechSatForCTC, - UniSpeechSatForPreTraining, - logging, -) +from transformers import UniSpeechSatConfig, UniSpeechSatForCTC, UniSpeechSatForPreTraining, logging logging.set_verbosity_info() diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index a9969b7159..efdd35725b 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -15,6 +15,7 @@ """ PyTorch UniSpeechSat model.""" import math +import warnings from dataclasses import dataclass from typing import Optional, Tuple, Union @@ -385,8 +386,8 @@ class UniSpeechSatSamePadLayer(nn.Module): return hidden_states -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->UniSpeechSat -class UniSpeechSatFeatureExtractor(nn.Module): +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->UniSpeechSat +class UniSpeechSatFeatureEncoder(nn.Module): """Construct the features from raw audio waveform""" def __init__(self, config): @@ -440,6 
+441,17 @@ class UniSpeechSatFeatureExtractor(nn.Module): return hidden_states +class UniSpeechSatFeatureExtractor(UniSpeechSatFeatureEncoder): + def __init__(self, config): + super().__init__(config) + warnings.warn( + f"The class `{self.__class__.__name__}` has been deprecated " + "and will be removed in Transformers v5. " + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->UniSpeechSat class UniSpeechSatFeatureProjection(nn.Module): def __init__(self, config): @@ -1014,7 +1026,7 @@ class UniSpeechSatPreTrainedModel(PreTrainedModel): return attention_mask def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (UniSpeechSatEncoder, UniSpeechSatEncoderStableLayerNorm, UniSpeechSatFeatureExtractor)): + if isinstance(module, (UniSpeechSatEncoder, UniSpeechSatEncoderStableLayerNorm, UniSpeechSatFeatureEncoder)): module.gradient_checkpointing = value @@ -1084,7 +1096,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel): def __init__(self, config: UniSpeechSatConfig): super().__init__(config) self.config = config - self.feature_extractor = UniSpeechSatFeatureExtractor(config) + self.feature_extractor = UniSpeechSatFeatureEncoder(config) self.feature_projection = UniSpeechSatFeatureProjection(config) self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_()) @@ -1232,10 +1244,22 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. 
""" - self.unispeech_sat.feature_extractor._freeze_parameters() + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + self.unispeech_sat.feature_extractor._freeze_parameters() @staticmethod def compute_contrastive_logits( @@ -1274,12 +1298,12 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel): ```python >>> import torch - >>> from transformers import UniSpeechSatFeatureExtractor, UniSpeechSatForPreTraining + >>> from transformers import Wav2Vec2FeatureExtractor, UniSpeechSatForPreTraining >>> from transformers.models.unispeech_sat.modeling_unispeech_sat import _compute_mask_indices >>> from datasets import load_dataset >>> import soundfile as sf - >>> feature_extractor = UniSpeechSatFeatureExtractor.from_pretrained("patrickvonplaten/unispeech_sat-base") + >>> feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("patrickvonplaten/unispeech_sat-base") >>> model = UniSpeechSatForPreTraining.from_pretrained("patrickvonplaten/unispeech_sat-base") @@ -1383,8 +1407,20 @@ class UniSpeechSatForCTC(UniSpeechSatPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameter - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" 
+ "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. """ self.unispeech_sat.feature_extractor._freeze_parameters() @@ -1492,8 +1528,20 @@ class UniSpeechSatForSequenceClassification(UniSpeechSatPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. """ self.unispeech_sat.feature_extractor._freeze_parameters() @@ -1596,8 +1644,20 @@ class UniSpeechSatForAudioFrameClassification(UniSpeechSatPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. 
" 
+ "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. """ self.unispeech_sat.feature_extractor._freeze_parameters() @@ -1745,8 +1805,20 @@ class UniSpeechSatForXVector(UniSpeechSatPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. """ self.unispeech_sat.feature_extractor._freeze_parameters() diff --git a/src/transformers/models/wav2vec2/configuration_wav2vec2.py b/src/transformers/models/wav2vec2/configuration_wav2vec2.py index 2ff006c51e..808fab2667 100644 --- a/src/transformers/models/wav2vec2/configuration_wav2vec2.py +++ b/src/transformers/models/wav2vec2/configuration_wav2vec2.py @@ -65,24 +65,24 @@ class Wav2Vec2Config(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. feat_extract_norm (`str`, *optional*, defaults to `"group"`): - The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group + The norm to be applied to 1D convolutional layers in feature encoder. 
One of `"group"` for group normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D convolutional layers. feat_proj_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability for output of the feature extractor. + The dropout probability for output of the feature encoder. feat_extract_activation (`str, `optional`, defaults to `"gelu"`): The non-linear activation function (function or string) in the 1D convolutional layers of the feature extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for quantized feature extractor states. + The dropout probabilitiy for quantized feature encoder states. conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the - feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers. + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): - A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length + A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*. conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): - A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The + A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The length of *conv_kernel* defines the number of convolutional layers and has to match the length of *conv_dim*. 
conv_bias (`bool`, *optional*, defaults to `False`): @@ -97,7 +97,7 @@ class Wav2Vec2Config(PretrainedConfig): True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer. apply_spec_augment (`bool`, *optional*, defaults to `True`): - Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see + Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779). mask_time_prob (`float`, *optional*, defaults to 0.05): @@ -132,7 +132,7 @@ class Wav2Vec2Config(PretrainedConfig): contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): The temperature *kappa* in the contrastive loss. feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for the output of the feature extractor that's used by the quantizer. + The dropout probabilitiy for the output of the feature encoder that's used by the quantizer. num_negatives (`int`, *optional*, defaults to 100): Number of negative samples for the contrastive loss. 
codevector_dim (`int`, *optional*, defaults to 256): diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py index 73d9a3ad25..a8ea74dd5c 100644 --- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py @@ -395,7 +395,7 @@ class FlaxConvLayersCollection(nn.Module): return hidden_states -class FlaxWav2Vec2FeatureExtractor(nn.Module): +class FlaxWav2Vec2FeatureEncoder(nn.Module): """Construct the features from raw audio waveform""" config: Wav2Vec2Config @@ -849,7 +849,7 @@ class FlaxWav2Vec2Module(nn.Module): dtype: jnp.dtype = jnp.float32 def setup(self): - self.feature_extractor = FlaxWav2Vec2FeatureExtractor(self.config, dtype=self.dtype) + self.feature_extractor = FlaxWav2Vec2FeatureEncoder(self.config, dtype=self.dtype) self.feature_projection = FlaxWav2Vec2FeatureProjection(self.config, dtype=self.dtype) self.masked_spec_embed = self.param( "masked_spec_embed", jax.nn.initializers.uniform(), (self.config.hidden_size,) diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py index ac7658b0c3..d7e15fb4f4 100644 --- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py @@ -655,7 +655,7 @@ class TFWav2Vec2SamePadLayer(tf.keras.layers.Layer): return hidden_states -class TFWav2Vec2FeatureExtractor(tf.keras.layers.Layer): +class TFWav2Vec2FeatureEncoder(tf.keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs: Any) -> None: super().__init__(**kwargs) @@ -682,6 +682,17 @@ class TFWav2Vec2FeatureExtractor(tf.keras.layers.Layer): return hidden_states +class TFWav2Vec2FeatureExtractor(TFWav2Vec2FeatureEncoder): + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + warnings.warn( + f"The class `{self.__class__.__name__}` has been depreciated " + 
"and will be removed in Transformers v5. " + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + class TFWav2Vec2FeatureProjection(tf.keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs): super().__init__(**kwargs) @@ -1107,7 +1118,7 @@ class TFWav2Vec2MainLayer(tf.keras.layers.Layer): def __init__(self, config: Wav2Vec2Config, **kwargs): super().__init__(**kwargs) self.config = config - self.feature_extractor = TFWav2Vec2FeatureExtractor(config, name="feature_extractor") + self.feature_extractor = TFWav2Vec2FeatureEncoder(config, name="feature_extractor") self.feature_projection = TFWav2Vec2FeatureProjection(config, name="feature_projection") if config.do_stable_layer_norm: @@ -1481,8 +1492,20 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameter - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. 
""" self.wav2vec2.feature_extractor.trainable = False diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index f45224ff3b..660e93350e 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -431,7 +431,7 @@ class Wav2Vec2SamePadLayer(nn.Module): return hidden_states -class Wav2Vec2FeatureExtractor(nn.Module): +class Wav2Vec2FeatureEncoder(nn.Module): """Construct the features from raw audio waveform""" def __init__(self, config): @@ -484,6 +484,17 @@ class Wav2Vec2FeatureExtractor(nn.Module): return hidden_states +class Wav2Vec2FeatureExtractor(Wav2Vec2FeatureEncoder): + def __init__(self, config): + super().__init__(config) + warnings.warn( + f"The class `{self.__class__.__name__}` has been depreciated " + "and will be removed in Transformers v5. " + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + class Wav2Vec2FeatureProjection(nn.Module): def __init__(self, config): super().__init__() @@ -1125,7 +1136,7 @@ class Wav2Vec2PreTrainedModel(PreTrainedModel): return attention_mask def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (Wav2Vec2Encoder, Wav2Vec2EncoderStableLayerNorm, Wav2Vec2FeatureExtractor)): + if isinstance(module, (Wav2Vec2Encoder, Wav2Vec2EncoderStableLayerNorm, Wav2Vec2FeatureEncoder)): module.gradient_checkpointing = value @@ -1194,7 +1205,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel): def __init__(self, config: Wav2Vec2Config): super().__init__(config) self.config = config - self.feature_extractor = Wav2Vec2FeatureExtractor(config) + self.feature_extractor = Wav2Vec2FeatureEncoder(config) self.feature_projection = Wav2Vec2FeatureProjection(config) # model only needs masking vector if mask prob is > 0.0 @@ -1213,8 +1224,20 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function 
will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. """ self.feature_extractor._freeze_parameters() @@ -1349,8 +1372,20 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. """ self.wav2vec2.feature_extractor._freeze_parameters() @@ -1637,8 +1672,20 @@ class Wav2Vec2ForCTC(Wav2Vec2PreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameter - will not be updated during training.
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. """ self.wav2vec2.feature_extractor._freeze_parameters() @@ -1745,8 +1792,20 @@ class Wav2Vec2ForSequenceClassification(Wav2Vec2PreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. " + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. """ self.wav2vec2.feature_extractor._freeze_parameters() @@ -1848,8 +1907,20 @@ class Wav2Vec2ForAudioFrameClassification(Wav2Vec2PreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training.
+ """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. """ self.wav2vec2.feature_extractor._freeze_parameters() @@ -1994,8 +2065,20 @@ class Wav2Vec2ForXVector(Wav2Vec2PreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. """ self.wav2vec2.feature_extractor._freeze_parameters() diff --git a/src/transformers/models/wavlm/configuration_wavlm.py b/src/transformers/models/wavlm/configuration_wavlm.py index 86fedac2e5..84eb542a16 100644 --- a/src/transformers/models/wavlm/configuration_wavlm.py +++ b/src/transformers/models/wavlm/configuration_wavlm.py @@ -64,24 +64,24 @@ class WavLMConfig(PretrainedConfig): layer_norm_eps (`float`, *optional*, defaults to 1e-12): The epsilon used by the layer normalization layers. feat_extract_norm (`str`, *optional*, defaults to `"group"`): - The norm to be applied to 1D convolutional layers in feature extractor. 
One of `"group"` for group + The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D convolutional layers. feat_proj_dropout (`float`, *optional*, defaults to 0.0): - The dropout probability for output of the feature extractor. + The dropout probability for output of the feature encoder. feat_extract_activation (`str, `optional`, defaults to `"gelu"`): The non-linear activation function (function or string) in the 1D convolutional layers of the feature extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for quantized feature extractor states. + The dropout probabilitiy for quantized feature encoder states. conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`): A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the - feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers. + feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers. conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`): - A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length + A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length of *conv_stride* defines the number of convolutional layers and has to match the the length of *conv_dim*. conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`): - A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The + A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. 
The length of *conv_kernel* defines the number of convolutional layers and has to match the the length of *conv_dim*. conv_bias (`bool`, *optional*, defaults to `False`): @@ -96,7 +96,7 @@ class WavLMConfig(PretrainedConfig): True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer. apply_spec_augment (`bool`, *optional*, defaults to `True`): - Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see + Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779). mask_time_prob (`float`, *optional*, defaults to 0.05): @@ -122,7 +122,7 @@ class WavLMConfig(PretrainedConfig): contrastive_logits_temperature (`float`, *optional*, defaults to 0.1): The temperature *kappa* in the contrastive loss. feat_quantizer_dropout (`float`, *optional*, defaults to 0.0): - The dropout probabilitiy for the output of the feature extractor that's used by the quantizer. + The dropout probabilitiy for the output of the feature encoder that's used by the quantizer. num_negatives (`int`, *optional*, defaults to 100): Number of negative samples for the contrastive loss. 
codevector_dim (`int`, *optional*, defaults to 256): diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index 088e1671c5..344d196167 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -15,6 +15,7 @@ """ PyTorch WavLM model.""" import math +import warnings from dataclasses import dataclass from typing import Optional, Tuple, Union @@ -352,8 +353,8 @@ class WavLMSamePadLayer(nn.Module): return hidden_states -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureExtractor with Wav2Vec2->WavLM -class WavLMFeatureExtractor(nn.Module): +# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->WavLM +class WavLMFeatureEncoder(nn.Module): """Construct the features from raw audio waveform""" def __init__(self, config): @@ -404,6 +405,17 @@ class WavLMFeatureExtractor(nn.Module): return hidden_states +class WavLMFeatureExtractor(WavLMFeatureEncoder): + def __init__(self, config): + super().__init__(config) + warnings.warn( + f"The class `{self.__class__.__name__}` has been depreciated " + "and will be removed in Transformers v5. 
" + f"Use `{self.__class__.__bases__[0].__name__}` instead.", + FutureWarning, + ) + + # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->WavLM class WavLMFeatureProjection(nn.Module): def __init__(self, config): @@ -1077,7 +1089,7 @@ class WavLMPreTrainedModel(PreTrainedModel): return attention_mask def _set_gradient_checkpointing(self, module, value=False): - if isinstance(module, (WavLMEncoder, WavLMEncoderStableLayerNorm, WavLMFeatureExtractor)): + if isinstance(module, (WavLMEncoder, WavLMEncoderStableLayerNorm, WavLMFeatureEncoder)): module.gradient_checkpointing = value @@ -1146,7 +1158,7 @@ class WavLMModel(WavLMPreTrainedModel): def __init__(self, config: WavLMConfig): super().__init__(config) self.config = config - self.feature_extractor = WavLMFeatureExtractor(config) + self.feature_extractor = WavLMFeatureEncoder(config) self.feature_projection = WavLMFeatureProjection(config) # model only needs masking vector if mask prob is > 0.0 @@ -1165,8 +1177,20 @@ class WavLMModel(WavLMPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. 
""" self.feature_extractor._freeze_parameters() @@ -1303,8 +1327,20 @@ class WavLMForCTC(WavLMPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameter - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. """ self.wavlm.feature_extractor._freeze_parameters() @@ -1412,8 +1448,20 @@ class WavLMForSequenceClassification(WavLMPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameters will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. 
""" self.wavlm.feature_extractor._freeze_parameters() @@ -1516,8 +1564,20 @@ class WavLMForAudioFrameClassification(WavLMPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. """ self.wavlm.feature_extractor._freeze_parameters() @@ -1665,8 +1725,20 @@ class WavLMForXVector(WavLMPreTrainedModel): def freeze_feature_extractor(self): """ - Calling this function will disable the gradient computation for the feature extractor so that its parameters - will not be updated during training. + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. + """ + warnings.warn( + "The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5." + "Please use the equivalent `freeze_feature_encoder` method instead.", + FutureWarning, + ) + self.freeze_feature_encoder() + + def freeze_feature_encoder(self): + """ + Calling this function will disable the gradient computation for the feature encoder so that its parameter will + not be updated during training. 
""" self.wavlm.feature_extractor._freeze_parameters() diff --git a/tests/test_modeling_hubert.py b/tests/test_modeling_hubert.py index 709db7b476..06ff8eeee8 100644 --- a/tests/test_modeling_hubert.py +++ b/tests/test_modeling_hubert.py @@ -225,7 +225,7 @@ class HubertModelTester: model.train() # freeze feature encoder - model.freeze_feature_extractor() + model.freeze_feature_encoder() input_values = input_values[:3] diff --git a/tests/test_modeling_sew.py b/tests/test_modeling_sew.py index 673dacabdc..2caaec43fa 100644 --- a/tests/test_modeling_sew.py +++ b/tests/test_modeling_sew.py @@ -203,7 +203,7 @@ class SEWModelTester: model.train() # freeze feature encoder - model.freeze_feature_extractor() + model.freeze_feature_encoder() input_values = input_values[:3] diff --git a/tests/test_modeling_sew_d.py b/tests/test_modeling_sew_d.py index 5774eb0949..bc254436d0 100644 --- a/tests/test_modeling_sew_d.py +++ b/tests/test_modeling_sew_d.py @@ -224,7 +224,7 @@ class SEWDModelTester: model.train() # freeze feature encoder - model.freeze_feature_extractor() + model.freeze_feature_encoder() input_values = input_values[:3] diff --git a/tests/test_modeling_tf_hubert.py b/tests/test_modeling_tf_hubert.py index b6fefe3462..fb878f5905 100644 --- a/tests/test_modeling_tf_hubert.py +++ b/tests/test_modeling_tf_hubert.py @@ -184,7 +184,7 @@ class TFHubertModelTester: model = TFHubertForCTC(config) # freeze feature encoder - model.freeze_feature_extractor() + model.freeze_feature_encoder() input_values = input_values[:3] diff --git a/tests/test_modeling_tf_wav2vec2.py b/tests/test_modeling_tf_wav2vec2.py index eb98e54c8d..add1ddd329 100644 --- a/tests/test_modeling_tf_wav2vec2.py +++ b/tests/test_modeling_tf_wav2vec2.py @@ -194,7 +194,7 @@ class TFWav2Vec2ModelTester: model = TFWav2Vec2ForCTC(config) # freeze feature encoder - model.freeze_feature_extractor() + model.freeze_feature_encoder() input_values = input_values[:3] diff --git a/tests/test_modeling_unispeech.py 
b/tests/test_modeling_unispeech.py index 9aab88a013..5091aebf7f 100644 --- a/tests/test_modeling_unispeech.py +++ b/tests/test_modeling_unispeech.py @@ -226,7 +226,7 @@ class UniSpeechModelTester: model.train() # freeze feature encoder - model.freeze_feature_extractor() + model.freeze_feature_encoder() input_values = input_values[:3] diff --git a/tests/test_modeling_unispeech_sat.py b/tests/test_modeling_unispeech_sat.py index e18d86a8bc..bd04995819 100644 --- a/tests/test_modeling_unispeech_sat.py +++ b/tests/test_modeling_unispeech_sat.py @@ -246,7 +246,7 @@ class UniSpeechSatModelTester: model.train() # freeze feature encoder - model.freeze_feature_extractor() + model.freeze_feature_encoder() input_values = input_values[:3] diff --git a/tests/test_modeling_wav2vec2.py b/tests/test_modeling_wav2vec2.py index aa2cea8be1..ca7f43d4ca 100644 --- a/tests/test_modeling_wav2vec2.py +++ b/tests/test_modeling_wav2vec2.py @@ -300,7 +300,7 @@ class Wav2Vec2ModelTester: model.train() # freeze feature encoder - model.freeze_feature_extractor() + model.freeze_feature_encoder() input_values = input_values[:3] diff --git a/tests/test_modeling_wavlm.py b/tests/test_modeling_wavlm.py index 89f7c8fb92..4aa5b51f6f 100644 --- a/tests/test_modeling_wavlm.py +++ b/tests/test_modeling_wavlm.py @@ -238,7 +238,7 @@ class WavLMModelTester: model.train() # freeze feature encoder - model.freeze_feature_extractor() + model.freeze_feature_encoder() input_values = input_values[:3]