From 6645eb61fa61cd24c773bcf2973d1b1e014d7964 Mon Sep 17 00:00:00 2001 From: Nik Date: Thu, 2 Dec 2021 15:05:31 +0100 Subject: [PATCH] fix #14524 (IndexError when mask prob is too low) (#14525) * fix #14524 (IndexError when mask prob is too low) * fix formatting * correct documentation, add option for setting min_num_masks * change the semantic meaning of `mask_prob` in _compute_mask_indices With this commit the meaing of `mask_prob` actually adhered to the probability for each vector to be the start of a masked span of length. * fix check_copies test * fix documentation to semantic meaning of `upper bound of overall masking percentage`, revert changes to _compute_mask_indices * fix typo --- .../models/hubert/configuration_hubert.py | 29 ++++++++++++++---- .../models/hubert/modeling_hubert.py | 27 ++++++++++++----- .../models/sew/configuration_sew.py | 29 ++++++++++++++---- src/transformers/models/sew/modeling_sew.py | 27 ++++++++++++----- .../models/sew_d/configuration_sew_d.py | 29 ++++++++++++++---- .../models/sew_d/modeling_sew_d.py | 27 ++++++++++++----- .../unispeech/configuration_unispeech.py | 29 ++++++++++++++---- .../models/unispeech/modeling_unispeech.py | 27 ++++++++++++----- .../configuration_unispeech_sat.py | 29 ++++++++++++++---- .../unispeech_sat/modeling_unispeech_sat.py | 27 ++++++++++++----- .../models/wav2vec2/configuration_wav2vec2.py | 29 ++++++++++++++---- .../models/wav2vec2/modeling_wav2vec2.py | 27 ++++++++++++----- tests/test_modeling_wav2vec2.py | 30 +++++++++++++++++++ 13 files changed, 288 insertions(+), 78 deletions(-) diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py index d663c458a9..84ed7a70bc 100644 --- a/src/transformers/models/hubert/configuration_hubert.py +++ b/src/transformers/models/hubert/configuration_hubert.py @@ -101,17 +101,30 @@ class HubertConfig(PretrainedConfig): `SpecAugment: A Simple Data Augmentation Method for Automatic Speech 
Recognition `__. mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): - Propability of each feature vector along the time axis to be chosen as the start of the vector span to be - masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be - masked along the time axis. This is only relevant if ``apply_spec_augment is True``. + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If + reasoning from the probability of each feature vector to be chosen as the start of the vector span to be + masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease + the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``. mask_time_length (:obj:`int`, `optional`, defaults to 10): Length of vector span along the time axis. + mask_time_min_masks (:obj:`int`, `optional`, defaults to 2): + The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time + step, irrespectively of ``mask_time_prob``. Only relevant if + ``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks`` mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): - Propability of each feature vector along the feature axis to be chosen as the start of the vector span to - be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be - masked along the time axis. This is only relevant if ``apply_spec_augment is True``. + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over + the axis.
If reasoning from the propability of each feature vector to be chosen as the start of the vector + span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that + overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment + is True``. mask_feature_length (:obj:`int`, `optional`, defaults to 10): Length of vector span along the feature axis. + mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),: + The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time + step, irrespectively of ``mask_feature_prob``. Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`): Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an instance of :class:`~transformers.HubertForCTC`. @@ -169,8 +182,10 @@ class HubertConfig(PretrainedConfig): apply_spec_augment=True, mask_time_prob=0.05, mask_time_length=10, + mask_time_min_masks=2, mask_feature_prob=0.0, mask_feature_length=10, + mask_feature_min_masks=0, ctc_loss_reduction="sum", ctc_zero_infinity=False, use_weighted_layer_sum=False, @@ -225,8 +240,10 @@ class HubertConfig(PretrainedConfig): self.apply_spec_augment = apply_spec_augment self.mask_time_prob = mask_time_prob self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks self.mask_feature_prob = mask_feature_prob self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks # ctc loss self.ctc_loss_reduction = ctc_loss_reduction diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 2f8c59257c..6d2affd2df 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -69,13 
+69,16 @@ def _compute_mask_indices( on CPU as part of the preprocessing during training. Args: - shape: the the shape for which to compute masks. - should be of size 2 where first element is batch size and 2nd is timesteps - mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by - number of timesteps divided by length of mask span to mask approximately this percentage of all elements. - however due to overlaps, the actual number will be smaller (unless no_overlap is True) + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. mask_length: size of the mask min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. 
""" batch_size, sequence_length = shape @@ -84,9 +87,11 @@ def _compute_mask_indices( if mask_length > sequence_length: raise ValueError( - f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" ) + # epsilon is used for probabilistic rounding epsilon = np.random.rand(1).item() def compute_num_masked_span(input_length): @@ -113,15 +118,21 @@ def _compute_mask_indices( max_num_masked_span = compute_num_masked_span(sequence_length) + if max_num_masked_span == 0: + return spec_aug_mask + for input_length in input_lengths: # compute num of masked spans for this input num_masked_span = compute_num_masked_span(input_length) + # get random indices to mask spec_aug_mask_idx = np.random.choice( np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False ) # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. 
dummy_mask_idx = spec_aug_mask_idx[0] spec_aug_mask_idx = np.concatenate( @@ -137,6 +148,7 @@ def _compute_mask_indices( ) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + # add offset to the starting indexes so that that indexes now create a span offsets = np.arange(mask_length)[None, None, :] offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( batch_size, max_num_masked_span * mask_length @@ -930,7 +942,7 @@ class HubertModel(HubertPreTrainedModel): mask_prob=self.config.mask_time_prob, mask_length=self.config.mask_time_length, attention_mask=attention_mask, - min_masks=2, + min_masks=self.config.mask_time_min_masks, ) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) @@ -941,6 +953,7 @@ class HubertModel(HubertPreTrainedModel): (batch_size, hidden_size), mask_prob=self.config.mask_feature_prob, mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, ) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) diff --git a/src/transformers/models/sew/configuration_sew.py b/src/transformers/models/sew/configuration_sew.py index 11832887b1..a5a7ff7908 100644 --- a/src/transformers/models/sew/configuration_sew.py +++ b/src/transformers/models/sew/configuration_sew.py @@ -95,17 +95,30 @@ class SEWConfig(PretrainedConfig): `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `__. mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): - Propability of each feature vector along the time axis to be chosen as the start of the vector span to be - masked. 
Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be - masked along the time axis. This is only relevant if ``apply_spec_augment is True``. + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If + reasoning from the probability of each feature vector to be chosen as the start of the vector span to be + masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease + the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``. mask_time_length (:obj:`int`, `optional`, defaults to 10): Length of vector span along the time axis. + mask_time_min_masks (:obj:`int`, `optional`, defaults to 2): + The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time + step, irrespectively of ``mask_time_prob``. Only relevant if + ``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks`` mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): - Propability of each feature vector along the feature axis to be chosen as the start of the vector span to - be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be - masked along the time axis. This is only relevant if ``apply_spec_augment is True``. + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over + the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector + span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that + overlap may decrease the actual percentage of masked vectors.
This is only relevant if ``apply_spec_augment + is True``. mask_feature_length (:obj:`int`, `optional`, defaults to 10): Length of vector span along the feature axis. + mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),: + The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time + step, irrespectively of ``mask_feature_prob``. Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`): Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an instance of :class:`~transformers.SEWForCTC`. @@ -162,8 +175,10 @@ class SEWConfig(PretrainedConfig): apply_spec_augment=True, mask_time_prob=0.05, mask_time_length=10, + mask_time_min_masks=2, mask_feature_prob=0.0, mask_feature_length=10, + mask_feature_min_masks=0, ctc_loss_reduction="mean", ctc_zero_infinity=False, use_weighted_layer_sum=False, @@ -215,8 +230,10 @@ class SEWConfig(PretrainedConfig): self.apply_spec_augment = apply_spec_augment self.mask_time_prob = mask_time_prob self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks self.mask_feature_prob = mask_feature_prob self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks # ctc loss self.ctc_loss_reduction = ctc_loss_reduction diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index fd4cf4bf4d..c028103c7f 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -67,13 +67,16 @@ def _compute_mask_indices( on CPU as part of the preprocessing during training. Args: - shape: the the shape for which to compute masks. 
- should be of size 2 where first element is batch size and 2nd is timesteps - mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by - number of timesteps divided by length of mask span to mask approximately this percentage of all elements. - however due to overlaps, the actual number will be smaller (unless no_overlap is True) + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. mask_length: size of the mask min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. 
""" batch_size, sequence_length = shape @@ -82,9 +85,11 @@ def _compute_mask_indices( if mask_length > sequence_length: raise ValueError( - f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" ) + # epsilon is used for probabilistic rounding epsilon = np.random.rand(1).item() def compute_num_masked_span(input_length): @@ -111,15 +116,21 @@ def _compute_mask_indices( max_num_masked_span = compute_num_masked_span(sequence_length) + if max_num_masked_span == 0: + return spec_aug_mask + for input_length in input_lengths: # compute num of masked spans for this input num_masked_span = compute_num_masked_span(input_length) + # get random indices to mask spec_aug_mask_idx = np.random.choice( np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False ) # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. 
dummy_mask_idx = spec_aug_mask_idx[0] spec_aug_mask_idx = np.concatenate( @@ -135,6 +146,7 @@ def _compute_mask_indices( ) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + # add offset to the starting indexes so that that indexes now create a span offsets = np.arange(mask_length)[None, None, :] offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( batch_size, max_num_masked_span * mask_length @@ -829,7 +841,7 @@ class SEWModel(SEWPreTrainedModel): mask_prob=self.config.mask_time_prob, mask_length=self.config.mask_time_length, attention_mask=attention_mask, - min_masks=2, + min_masks=self.config.mask_time_min_masks, ) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) @@ -840,6 +852,7 @@ class SEWModel(SEWPreTrainedModel): (batch_size, hidden_size), mask_prob=self.config.mask_feature_prob, mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, ) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) diff --git a/src/transformers/models/sew_d/configuration_sew_d.py b/src/transformers/models/sew_d/configuration_sew_d.py index d526105523..09976c9204 100644 --- a/src/transformers/models/sew_d/configuration_sew_d.py +++ b/src/transformers/models/sew_d/configuration_sew_d.py @@ -113,17 +113,30 @@ class SEWDConfig(PretrainedConfig): `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `__. mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): - Propability of each feature vector along the time axis to be chosen as the start of the vector span to be - masked. 
Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be - masked along the time axis. This is only relevant if ``apply_spec_augment is True``. + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If + reasoning from the probability of each feature vector to be chosen as the start of the vector span to be + masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease + the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``. mask_time_length (:obj:`int`, `optional`, defaults to 10): Length of vector span along the time axis. + mask_time_min_masks (:obj:`int`, `optional`, defaults to 2): + The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time + step, irrespectively of ``mask_time_prob``. Only relevant if + ``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks`` mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): - Propability of each feature vector along the feature axis to be chosen as the start of the vector span to - be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be - masked along the time axis. This is only relevant if ``apply_spec_augment is True``. + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over + the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector + span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that + overlap may decrease the actual percentage of masked vectors.
This is only relevant if ``apply_spec_augment + is True``. mask_feature_length (:obj:`int`, `optional`, defaults to 10): Length of vector span along the feature axis. + mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),: + The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time + step, irrespectively of ``mask_feature_prob``. Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1): The weight of the codebook diversity loss component. ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`): @@ -190,8 +203,10 @@ class SEWDConfig(PretrainedConfig): apply_spec_augment=True, mask_time_prob=0.05, mask_time_length=10, + mask_time_min_masks=2, mask_feature_prob=0.0, mask_feature_length=10, + mask_feature_min_masks=0, ctc_loss_reduction="mean", ctc_zero_infinity=False, use_weighted_layer_sum=False, @@ -251,8 +266,10 @@ class SEWDConfig(PretrainedConfig): self.apply_spec_augment = apply_spec_augment self.mask_time_prob = mask_time_prob self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks self.mask_feature_prob = mask_feature_prob self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks # ctc loss self.ctc_loss_reduction = ctc_loss_reduction diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 53f9862b47..677c1384f7 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -73,13 +73,16 @@ def _compute_mask_indices( on CPU as part of the preprocessing during training. Args: - shape: the the shape for which to compute masks. - should be of size 2 where first element is batch size and 2nd is timesteps - mask_prob: probability for each token to be chosen as start of the span to be masked. 
this will be multiplied by - number of timesteps divided by length of mask span to mask approximately this percentage of all elements. - however due to overlaps, the actual number will be smaller (unless no_overlap is True) + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. mask_length: size of the mask min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. """ batch_size, sequence_length = shape @@ -88,9 +91,11 @@ def _compute_mask_indices( if mask_length > sequence_length: raise ValueError( - f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" ) + # epsilon is used for probabilistic rounding epsilon = np.random.rand(1).item() def compute_num_masked_span(input_length): @@ -117,15 +122,21 @@ def _compute_mask_indices( max_num_masked_span = compute_num_masked_span(sequence_length) + if max_num_masked_span == 0: + return spec_aug_mask + for input_length in input_lengths: # compute num of masked spans for this input num_masked_span = compute_num_masked_span(input_length) + # get random indices to mask spec_aug_mask_idx = np.random.choice( np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False ) # pick first sampled index that will serve as a dummy index to pad vector + # 
to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. dummy_mask_idx = spec_aug_mask_idx[0] spec_aug_mask_idx = np.concatenate( @@ -141,6 +152,7 @@ def _compute_mask_indices( ) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + # add offset to the starting indexes so that that indexes now create a span offsets = np.arange(mask_length)[None, None, :] offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( batch_size, max_num_masked_span * mask_length @@ -1360,7 +1372,7 @@ class SEWDModel(SEWDPreTrainedModel): mask_prob=self.config.mask_time_prob, mask_length=self.config.mask_time_length, attention_mask=attention_mask, - min_masks=2, + min_masks=self.config.mask_time_min_masks, ) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) @@ -1371,6 +1383,7 @@ class SEWDModel(SEWDPreTrainedModel): (batch_size, hidden_size), mask_prob=self.config.mask_feature_prob, mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, ) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) diff --git a/src/transformers/models/unispeech/configuration_unispeech.py b/src/transformers/models/unispeech/configuration_unispeech.py index caefb383a4..d328f5a6df 100644 --- a/src/transformers/models/unispeech/configuration_unispeech.py +++ b/src/transformers/models/unispeech/configuration_unispeech.py @@ -101,17 +101,30 @@ class UniSpeechConfig(PretrainedConfig): `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `__. 
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): - Propability of each feature vector along the time axis to be chosen as the start of the vector span to be - masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be - masked along the time axis. This is only relevant if ``apply_spec_augment is True``. + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If + reasoning from the probability of each feature vector to be chosen as the start of the vector span to be + masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease + the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``. mask_time_length (:obj:`int`, `optional`, defaults to 10): Length of vector span along the time axis. + mask_time_min_masks (:obj:`int`, `optional`, defaults to 2): + The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time + step, irrespectively of ``mask_time_prob``. Only relevant if + ``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks`` mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): - Propability of each feature vector along the feature axis to be chosen as the start of the vector span to - be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be - masked along the time axis. This is only relevant if ``apply_spec_augment is True``. + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over + the axis.
If reasoning from the propability of each feature vector to be chosen as the start of the vector + span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that + overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment + is True``. mask_feature_length (:obj:`int`, `optional`, defaults to 10): Length of vector span along the feature axis. + mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),: + The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time + step, irrespectively of ``mask_feature_prob``. Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320): Number of entries in each quantization codebook (group). num_codevector_groups (:obj:`int`, `optional`, defaults to 2): @@ -187,8 +200,10 @@ class UniSpeechConfig(PretrainedConfig): apply_spec_augment=True, mask_time_prob=0.05, mask_time_length=10, + mask_time_min_masks=2, mask_feature_prob=0.0, mask_feature_length=10, + mask_feature_min_masks=0, num_codevectors_per_group=320, num_codevector_groups=2, contrastive_logits_temperature=0.1, @@ -252,8 +267,10 @@ class UniSpeechConfig(PretrainedConfig): self.apply_spec_augment = apply_spec_augment self.mask_time_prob = mask_time_prob self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks self.mask_feature_prob = mask_feature_prob self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks # parameters for pretraining with codevector quantized representations self.num_codevectors_per_group = num_codevectors_per_group diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index cd4ff01081..b0e2168bc4 100755 --- 
a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -136,13 +136,16 @@ def _compute_mask_indices( on CPU as part of the preprocessing during training. Args: - shape: the the shape for which to compute masks. - should be of size 2 where first element is batch size and 2nd is timesteps - mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by - number of timesteps divided by length of mask span to mask approximately this percentage of all elements. - however due to overlaps, the actual number will be smaller (unless no_overlap is True) + shape: The shape for which to compute masks. This should be of a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage will be smaller. mask_length: size of the mask min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension. 
""" batch_size, sequence_length = shape @@ -151,9 +154,11 @@ def _compute_mask_indices( if mask_length > sequence_length: raise ValueError( - f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" ) + # epsilon is used for probabilistic rounding epsilon = np.random.rand(1).item() def compute_num_masked_span(input_length): @@ -180,15 +185,21 @@ def _compute_mask_indices( max_num_masked_span = compute_num_masked_span(sequence_length) + if max_num_masked_span == 0: + return spec_aug_mask + for input_length in input_lengths: # compute num of masked spans for this input num_masked_span = compute_num_masked_span(input_length) + # get random indices to mask spec_aug_mask_idx = np.random.choice( np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False ) # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. 
dummy_mask_idx = spec_aug_mask_idx[0] spec_aug_mask_idx = np.concatenate( @@ -204,6 +215,7 @@ def _compute_mask_indices( ) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + # add offset to the starting indexes so that that indexes now create a span offsets = np.arange(mask_length)[None, None, :] offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( batch_size, max_num_masked_span * mask_length @@ -1076,7 +1088,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel): mask_prob=self.config.mask_time_prob, mask_length=self.config.mask_time_length, attention_mask=attention_mask, - min_masks=2, + min_masks=self.config.mask_time_min_masks, ) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) @@ -1087,6 +1099,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel): (batch_size, hidden_size), mask_prob=self.config.mask_feature_prob, mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, ) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) diff --git a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py index 040e0abd0b..ecf8b01f1c 100644 --- a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py @@ -101,17 +101,30 @@ class UniSpeechSatConfig(PretrainedConfig): `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `__. mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): - Propability of each feature vector along the time axis to be chosen as the start of the vector span to be - masked. 
Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be - masked along the time axis. This is only relevant if ``apply_spec_augment is True``. + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If + reasoning from the probability of each feature vector to be chosen as the start of the vector span to be + masked, ``mask_time_prob`` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease + the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``. mask_time_length (:obj:`int`, `optional`, defaults to 10): Length of vector span along the time axis. + mask_time_min_masks (:obj:`int`, `optional`, defaults to 2): + The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time + step, irrespectively of ``mask_time_prob``. Only relevant if + ``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``. mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): - Propability of each feature vector along the feature axis to be chosen as the start of the vector span to - be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be - masked along the time axis. This is only relevant if ``apply_spec_augment is True``. + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over + the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector + span to be masked, ``mask_feature_prob`` should be ``prob_vector_start*mask_feature_length``. Note that + overlap may decrease the actual percentage of masked vectors.
This is only relevant if ``apply_spec_augment + is True``. mask_feature_length (:obj:`int`, `optional`, defaults to 10): Length of vector span along the feature axis. + mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),: + The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time + step, irrespectively of ``mask_feature_prob``. Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320): Number of entries in each quantization codebook (group). num_codevector_groups (:obj:`int`, `optional`, defaults to 2): @@ -185,8 +198,10 @@ class UniSpeechSatConfig(PretrainedConfig): apply_spec_augment=True, mask_time_prob=0.05, mask_time_length=10, + mask_time_min_masks=2, mask_feature_prob=0.0, mask_feature_length=10, + mask_feature_min_masks=0, num_codevectors_per_group=320, num_codevector_groups=2, contrastive_logits_temperature=0.1, @@ -249,8 +264,10 @@ class UniSpeechSatConfig(PretrainedConfig): self.apply_spec_augment = apply_spec_augment self.mask_time_prob = mask_time_prob self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks self.mask_feature_prob = mask_feature_prob self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks # parameters for pretraining with codevector quantized representations self.num_codevectors_per_group = num_codevectors_per_group diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index c69faafc43..9e1f89ae55 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -137,13 +137,16 @@ def _compute_mask_indices( on CPU as part of the preprocessing during training. Args: - shape: the the shape for which to compute masks. 
- should be of size 2 where first element is batch size and 2nd is timesteps - mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by - number of timesteps divided by length of mask span to mask approximately this percentage of all elements. - however due to overlaps, the actual number will be smaller (unless no_overlap is True) + shape: The shape for which to compute masks. This should be a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage may be smaller. mask_length: size of the mask min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension.
""" batch_size, sequence_length = shape @@ -152,9 +155,11 @@ def _compute_mask_indices( if mask_length > sequence_length: raise ValueError( - f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" ) + # epsilon is used for probabilistic rounding epsilon = np.random.rand(1).item() def compute_num_masked_span(input_length): @@ -181,15 +186,21 @@ def _compute_mask_indices( max_num_masked_span = compute_num_masked_span(sequence_length) + if max_num_masked_span == 0: + return spec_aug_mask + for input_length in input_lengths: # compute num of masked spans for this input num_masked_span = compute_num_masked_span(input_length) + # get random indices to mask spec_aug_mask_idx = np.random.choice( np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False ) # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. 
dummy_mask_idx = spec_aug_mask_idx[0] spec_aug_mask_idx = np.concatenate( @@ -205,6 +216,7 @@ def _compute_mask_indices( ) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + # add offset to the starting indexes so that that indexes now create a span offsets = np.arange(mask_length)[None, None, :] offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( batch_size, max_num_masked_span * mask_length @@ -1077,7 +1089,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel): mask_prob=self.config.mask_time_prob, mask_length=self.config.mask_time_length, attention_mask=attention_mask, - min_masks=2, + min_masks=self.config.mask_time_min_masks, ) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) @@ -1088,6 +1100,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel): (batch_size, hidden_size), mask_prob=self.config.mask_feature_prob, mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, ) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) diff --git a/src/transformers/models/wav2vec2/configuration_wav2vec2.py b/src/transformers/models/wav2vec2/configuration_wav2vec2.py index 69e2106c11..fcbfd1c41e 100644 --- a/src/transformers/models/wav2vec2/configuration_wav2vec2.py +++ b/src/transformers/models/wav2vec2/configuration_wav2vec2.py @@ -101,17 +101,30 @@ class Wav2Vec2Config(PretrainedConfig): `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `__. mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): - Propability of each feature vector along the time axis to be chosen as the start of the vector span to be - masked. 
Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be - masked along the time axis. This is only relevant if ``apply_spec_augment is True``. + Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking + procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If + reasoning from the probability of each feature vector to be chosen as the start of the vector span to be + masked, ``mask_time_prob`` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease + the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``. mask_time_length (:obj:`int`, `optional`, defaults to 10): Length of vector span along the time axis. + mask_time_min_masks (:obj:`int`, `optional`, defaults to 2): + The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time + step, irrespectively of ``mask_time_prob``. Only relevant if + ``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``. mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): - Propability of each feature vector along the feature axis to be chosen as the start of the vector span to - be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be - masked along the time axis. This is only relevant if ``apply_spec_augment is True``. + Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The + masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over + the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector + span to be masked, ``mask_feature_prob`` should be ``prob_vector_start*mask_feature_length``. Note that + overlap may decrease the actual percentage of masked vectors.
This is only relevant if ``apply_spec_augment + is True``. mask_feature_length (:obj:`int`, `optional`, defaults to 10): Length of vector span along the feature axis. + mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),: + The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time + step, irrespectively of ``mask_feature_prob``. Only relevant if + ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks'' num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320): Number of entries in each quantization codebook (group). num_codevector_groups (:obj:`int`, `optional`, defaults to 2): @@ -198,8 +211,10 @@ class Wav2Vec2Config(PretrainedConfig): apply_spec_augment=True, mask_time_prob=0.05, mask_time_length=10, + mask_time_min_masks=2, mask_feature_prob=0.0, mask_feature_length=10, + mask_feature_min_masks=0, num_codevectors_per_group=320, num_codevector_groups=2, contrastive_logits_temperature=0.1, @@ -265,8 +280,10 @@ class Wav2Vec2Config(PretrainedConfig): self.apply_spec_augment = apply_spec_augment self.mask_time_prob = mask_time_prob self.mask_time_length = mask_time_length + self.mask_time_min_masks = mask_time_min_masks self.mask_feature_prob = mask_feature_prob self.mask_feature_length = mask_feature_length + self.mask_feature_min_masks = mask_feature_min_masks # parameters for pretraining with codevector quantized representations self.num_codevectors_per_group = num_codevectors_per_group diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 00eec6933b..e8fcc5b805 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -145,13 +145,16 @@ def _compute_mask_indices( on CPU as part of the preprocessing during training. Args: - shape: the the shape for which to compute masks. 
- should be of size 2 where first element is batch size and 2nd is timesteps - mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by - number of timesteps divided by length of mask span to mask approximately this percentage of all elements. - however due to overlaps, the actual number will be smaller (unless no_overlap is True) + shape: The shape for which to compute masks. This should be a tuple of size 2 where + the first element is the batch size and the second element is the length of the axis to span. + mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of + independently generated mask spans of length `mask_length` is computed by + `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the + actual percentage may be smaller. mask_length: size of the mask min_masks: minimum number of masked spans + attention_mask: A (right-padded) attention mask which independently shortens the feature axis of + each batch dimension.
""" batch_size, sequence_length = shape @@ -160,9 +163,11 @@ def _compute_mask_indices( if mask_length > sequence_length: raise ValueError( - f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" + f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}" + f" and `sequence_length`: {sequence_length}`" ) + # epsilon is used for probabilistic rounding epsilon = np.random.rand(1).item() def compute_num_masked_span(input_length): @@ -189,15 +194,21 @@ def _compute_mask_indices( max_num_masked_span = compute_num_masked_span(sequence_length) + if max_num_masked_span == 0: + return spec_aug_mask + for input_length in input_lengths: # compute num of masked spans for this input num_masked_span = compute_num_masked_span(input_length) + # get random indices to mask spec_aug_mask_idx = np.random.choice( np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False ) # pick first sampled index that will serve as a dummy index to pad vector + # to ensure same dimension for all batches due to probabilistic rounding + # Picking first sample just pads those vectors twice. 
dummy_mask_idx = spec_aug_mask_idx[0] spec_aug_mask_idx = np.concatenate( @@ -213,6 +224,7 @@ def _compute_mask_indices( ) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) + # add offset to the starting indexes so that that indexes now create a span offsets = np.arange(mask_length)[None, None, :] offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( batch_size, max_num_masked_span * mask_length @@ -1182,7 +1194,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel): mask_prob=self.config.mask_time_prob, mask_length=self.config.mask_time_length, attention_mask=attention_mask, - min_masks=2, + min_masks=self.config.mask_time_min_masks, ) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) @@ -1193,6 +1205,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel): (batch_size, hidden_size), mask_prob=self.config.mask_feature_prob, mask_length=self.config.mask_feature_length, + min_masks=self.config.mask_feature_min_masks, ) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) diff --git a/tests/test_modeling_wav2vec2.py b/tests/test_modeling_wav2vec2.py index c689b05f25..278465341a 100644 --- a/tests/test_modeling_wav2vec2.py +++ b/tests/test_modeling_wav2vec2.py @@ -854,6 +854,36 @@ class Wav2Vec2UtilsTest(unittest.TestCase): self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) + def test_compute_mask_indices_low_prob(self): + # with these settings num_masked_spans=0.5, which means probabilistic rounding + # ensures that in 5 out of 10 method calls, num_masked_spans=0, and in + # the other 5 out of 10, cases num_masked_spans=1 + n_trials = 100 + batch_size = 4 + sequence_length = 100 
+ mask_prob = 0.05 + mask_length = 10 + + count_dimensions_masked = 0 + count_dimensions_not_masked = 0 + + for _ in range(n_trials): + mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length) + mask = torch.from_numpy(mask).to(torch_device) + + num_masks = torch.sum(mask).item() + + if num_masks > 0: + count_dimensions_masked += 1 + else: + count_dimensions_not_masked += 1 + + # as we require more than 10 masked trials and more than + # 10 non-masked trials, each assertion could fail with probability: + # P(100 coin flips, at most 10 heads) = 1.53e-17 + self.assertGreater(count_dimensions_masked, int(n_trials * 0.1)) + self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1)) + def test_compute_mask_indices_overlap(self): batch_size = 4 sequence_length = 80