From 1d21471c78baaaf1596a86c6af37de6076dbfa87 Mon Sep 17 00:00:00 2001 From: Magnus Pierrau <56202367+mpierrau@users.noreply.github.com> Date: Thu, 5 Jan 2023 17:24:55 +0100 Subject: [PATCH] Added mask_time_prob and mask_time_length arguments to wav2vec2 pretraining script (#20985) Added mask_time_prob and mask_time_length arguments to wav2vec2 pretraining script and readme - new branch --- examples/pytorch/speech-pretraining/README.md | 6 +++ .../run_wav2vec2_pretraining_no_trainer.py | 42 +++++++++++++++++-- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/examples/pytorch/speech-pretraining/README.md b/examples/pytorch/speech-pretraining/README.md index 1d57fc8e72..d0126634d2 100644 --- a/examples/pytorch/speech-pretraining/README.md +++ b/examples/pytorch/speech-pretraining/README.md @@ -79,6 +79,8 @@ accelerate launch run_wav2vec2_pretraining_no_trainer.py \ --adam_beta2="0.98" \ --adam_epsilon="1e-06" \ --gradient_checkpointing \ + --mask_time_prob="0.65" \ + --mask_time_length="10" ``` The results of this run can be seen [here](https://wandb.ai/patrickvonplaten/wav2vec2-pretrained-demo/reports/Wav2Vec2-PreTraining-Demo-Run--VmlldzoxMDk3MjAw?accessToken=oa05s1y57lizo2ocxy3k01g6db1u4pt8m6ur2n8nl4cb0ug02ms2cw313kb8ruch). @@ -110,6 +112,8 @@ accelerate launch run_wav2vec2_pretraining_no_trainer.py \ --adam_beta2="0.98" \ --adam_epsilon="1e-06" \ --gradient_checkpointing \ + --mask_time_prob="0.65" \ + --mask_time_length="10" ``` The experiment was run on 8 GPU V100 (16 GB RAM each) for 4 days. @@ -146,6 +150,8 @@ accelerate launch run_wav2vec2_pretraining_no_trainer.py \ --adam_beta2=0.98 \ --adam_epsilon=1e-06 \ --gradient_checkpointing \ + --mask_time_prob=0.65 \ + --mask_time_length=10 ``` The experiment was run on 8 GPU V100 (16 GB RAM each) for 7 days. diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py index 0de1776df5..c15a8b73f5 100755 --- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py +++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py @@ -247,6 +247,24 @@ def parse_args(): "--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`." ) parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--mask_time_prob", + type=float, + default=None, + help=( + "Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked in the" + " contrastive task. If omitted, will pull value from model config." + ), + ) + parser.add_argument( + "--mask_time_length", + type=int, + default=None, + help=( + "Length of each vector mask span to mask along the time axis in the contrastive task." + " If omitted, will pull value from model config." + ), + ) args = parser.parse_args() if args.push_to_hub: @@ -285,12 +303,22 @@ class DataCollatorForWav2Vec2Pretraining: If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta). + mask_time_prob (:obj:`float`, `optional`, defaults to :obj:`0.65`): + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked for the contrastive task. + Note that overlap between masked sequences may decrease the actual percentage of masked vectors. + The default value is taken from the original wav2vec 2.0 article (https://arxiv.org/abs/2006.11477), + and results in about 49 percent of each sequence being masked on average. + mask_time_length (:obj:`int`, `optional`, defaults to :obj:`10`): + Length of each vector mask span to mask along the time axis in the contrastive task. The default value + originates from the original wav2vec 2.0 article and corresponds to the ``M`` variable mentioned there. """ model: Wav2Vec2ForPreTraining feature_extractor: Wav2Vec2FeatureExtractor padding: Union[bool, str] = "longest" pad_to_multiple_of: Optional[int] = None + mask_time_prob: Optional[float] = 0.65 + mask_time_length: Optional[int] = 10 def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: # reformat list to dict and set to pytorch format @@ -320,8 +348,8 @@ class DataCollatorForWav2Vec2Pretraining: # sample randomly masked indices mask_time_indices = _compute_mask_indices( features_shape, - self.model.config.mask_time_prob, - self.model.config.mask_time_length, + self.mask_time_prob, + self.mask_time_length, attention_mask=batch.get("sub_attention_mask"), ) @@ -515,8 +543,16 @@ def main(): model.gradient_checkpointing_enable() # 4. Define data collator, optimizer and scheduler + + mask_time_prob = config.mask_time_prob if args.mask_time_prob is None else args.mask_time_prob + mask_time_length = config.mask_time_length if args.mask_time_length is None else args.mask_time_length + data_collator = DataCollatorForWav2Vec2Pretraining( - model=model, feature_extractor=feature_extractor, pad_to_multiple_of=args.pad_to_multiple_of + model=model, + feature_extractor=feature_extractor, + pad_to_multiple_of=args.pad_to_multiple_of, + mask_time_prob=mask_time_prob, + mask_time_length=mask_time_length, ) train_dataloader = DataLoader( vectorized_datasets["train"],