fix #14524 (IndexError when mask prob is too low) (#14525)

* fix #14524 (IndexError when mask prob is too low)

* fix formatting

* correct documentation, add option for setting min_num_masks

* change the semantic meaning of `mask_prob` in _compute_mask_indices

With this commit the meaning of `mask_prob` actually adheres to the probability for each
vector to be the start of a masked span of length `mask_length`.

* fix check_copies test

* fix documentation to semantic meaning of `upper bound of overall masking percentage`, revert changes to _compute_mask_indices

* fix typo
This commit is contained in:
Nik
2021-12-02 15:05:31 +01:00
committed by GitHub
parent 96cc02b51b
commit 6645eb61fa
13 changed files with 288 additions and 78 deletions

View File

@@ -101,17 +101,30 @@ class HubertConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespectively of ``mask_time_prob``. Only relevant if
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespectively of ``mask_feature_prob``. Only relevant if
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`): ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
instance of :class:`~transformers.HubertForCTC`. instance of :class:`~transformers.HubertForCTC`.
@@ -169,8 +182,10 @@ class HubertConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="sum", ctc_loss_reduction="sum",
ctc_zero_infinity=False, ctc_zero_infinity=False,
use_weighted_layer_sum=False, use_weighted_layer_sum=False,
@@ -225,8 +240,10 @@ class HubertConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# ctc loss # ctc loss
self.ctc_loss_reduction = ctc_loss_reduction self.ctc_loss_reduction = ctc_loss_reduction

View File

@@ -69,13 +69,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be of a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
@@ -84,9 +87,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
@@ -113,15 +118,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
@@ -137,6 +148,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
@@ -930,7 +942,7 @@ class HubertModel(HubertPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
@@ -941,6 +953,7 @@ class HubertModel(HubertPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)

View File

@@ -95,17 +95,30 @@ class SEWConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespectively of ``mask_time_prob``. Only relevant if
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespectively of ``mask_feature_prob``. Only relevant if
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`): ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
instance of :class:`~transformers.SEWForCTC`. instance of :class:`~transformers.SEWForCTC`.
@@ -162,8 +175,10 @@ class SEWConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="mean", ctc_loss_reduction="mean",
ctc_zero_infinity=False, ctc_zero_infinity=False,
use_weighted_layer_sum=False, use_weighted_layer_sum=False,
@@ -215,8 +230,10 @@ class SEWConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# ctc loss # ctc loss
self.ctc_loss_reduction = ctc_loss_reduction self.ctc_loss_reduction = ctc_loss_reduction

View File

@@ -67,13 +67,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be of a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
@@ -82,9 +85,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
@@ -111,15 +116,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
@@ -135,6 +146,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
@@ -829,7 +841,7 @@ class SEWModel(SEWPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
@@ -840,6 +852,7 @@ class SEWModel(SEWPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)

View File

@@ -113,17 +113,30 @@ class SEWDConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespectively of ``mask_time_prob``. Only relevant if
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespectively of ``mask_feature_prob``. Only relevant if
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1): diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1):
The weight of the codebook diversity loss component. The weight of the codebook diversity loss component.
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`): ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
@@ -190,8 +203,10 @@ class SEWDConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="mean", ctc_loss_reduction="mean",
ctc_zero_infinity=False, ctc_zero_infinity=False,
use_weighted_layer_sum=False, use_weighted_layer_sum=False,
@@ -251,8 +266,10 @@ class SEWDConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# ctc loss # ctc loss
self.ctc_loss_reduction = ctc_loss_reduction self.ctc_loss_reduction = ctc_loss_reduction

View File

@@ -73,13 +73,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be of a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
@@ -88,9 +91,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
@@ -117,15 +122,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
@@ -141,6 +152,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
@@ -1360,7 +1372,7 @@ class SEWDModel(SEWDPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
@@ -1371,6 +1383,7 @@ class SEWDModel(SEWDPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)

View File

@@ -101,17 +101,30 @@ class UniSpeechConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespectively of ``mask_time_prob``. Only relevant if
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``.
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespectively of ``mask_feature_prob``. Only relevant if
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``.
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320): num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
Number of entries in each quantization codebook (group). Number of entries in each quantization codebook (group).
num_codevector_groups (:obj:`int`, `optional`, defaults to 2): num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
@@ -187,8 +200,10 @@ class UniSpeechConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
num_codevectors_per_group=320, num_codevectors_per_group=320,
num_codevector_groups=2, num_codevector_groups=2,
contrastive_logits_temperature=0.1, contrastive_logits_temperature=0.1,
@@ -252,8 +267,10 @@ class UniSpeechConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# parameters for pretraining with codevector quantized representations # parameters for pretraining with codevector quantized representations
self.num_codevectors_per_group = num_codevectors_per_group self.num_codevectors_per_group = num_codevectors_per_group

View File

@@ -136,13 +136,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be of a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
@@ -151,9 +154,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
@@ -180,15 +185,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
@@ -204,6 +215,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that that indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
@@ -1076,7 +1088,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
@@ -1087,6 +1099,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)

View File

@@ -101,17 +101,30 @@ class UniSpeechSatConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespectively of ``mask_time_prob``. Only relevant if
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``.
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespectively of ``mask_feature_prob``. Only relevant if
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``.
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320): num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
Number of entries in each quantization codebook (group). Number of entries in each quantization codebook (group).
num_codevector_groups (:obj:`int`, `optional`, defaults to 2): num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
@@ -185,8 +198,10 @@ class UniSpeechSatConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
num_codevectors_per_group=320, num_codevectors_per_group=320,
num_codevector_groups=2, num_codevector_groups=2,
contrastive_logits_temperature=0.1, contrastive_logits_temperature=0.1,
@@ -249,8 +264,10 @@ class UniSpeechSatConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# parameters for pretraining with codevector quantized representations # parameters for pretraining with codevector quantized representations
self.num_codevectors_per_group = num_codevectors_per_group self.num_codevectors_per_group = num_codevectors_per_group

View File

@@ -137,13 +137,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be of a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
@@ -152,9 +155,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
@@ -181,15 +186,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
@@ -205,6 +216,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that that indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
@@ -1077,7 +1089,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
@@ -1088,6 +1100,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)

View File

@@ -101,17 +101,30 @@ class Wav2Vec2Config(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespectively of ``mask_time_prob``. Only relevant if
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``.
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespectively of ``mask_feature_prob``. Only relevant if
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``.
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320): num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
Number of entries in each quantization codebook (group). Number of entries in each quantization codebook (group).
num_codevector_groups (:obj:`int`, `optional`, defaults to 2): num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
@@ -198,8 +211,10 @@ class Wav2Vec2Config(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
num_codevectors_per_group=320, num_codevectors_per_group=320,
num_codevector_groups=2, num_codevector_groups=2,
contrastive_logits_temperature=0.1, contrastive_logits_temperature=0.1,
@@ -265,8 +280,10 @@ class Wav2Vec2Config(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# parameters for pretraining with codevector quantized representations # parameters for pretraining with codevector quantized representations
self.num_codevectors_per_group = num_codevectors_per_group self.num_codevectors_per_group = num_codevectors_per_group

View File

@@ -145,13 +145,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be of a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
@@ -160,9 +163,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
@@ -189,15 +194,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
@@ -213,6 +224,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that that indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
@@ -1182,7 +1194,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
@@ -1193,6 +1205,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)

View File

@@ -854,6 +854,36 @@ class Wav2Vec2UtilsTest(unittest.TestCase):
self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)])
def test_compute_mask_indices_low_prob(self):
# with these settings num_masked_spans=0.5, which means probabilistic rounding
# ensures that in 5 out of 10 method calls, num_masked_spans=0, and in
# the other 5 out of 10, cases num_masked_spans=1
n_trials = 100
batch_size = 4
sequence_length = 100
mask_prob = 0.05
mask_length = 10
count_dimensions_masked = 0
count_dimensions_not_masked = 0
for _ in range(n_trials):
mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
mask = torch.from_numpy(mask).to(torch_device)
num_masks = torch.sum(mask).item()
if num_masks > 0:
count_dimensions_masked += 1
else:
count_dimensions_not_masked += 1
# as we test for at least 10 masked dimension and at least
# 10 non-masked dimension, this test could fail with probability:
# P(100 coin flips, at most 9 heads) = 1.66e-18
self.assertGreater(count_dimensions_masked, int(n_trials * 0.1))
self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1))
def test_compute_mask_indices_overlap(self): def test_compute_mask_indices_overlap(self):
batch_size = 4 batch_size = 4
sequence_length = 80 sequence_length = 80