* fix #14524 (IndexError when mask prob is too low) * fix formatting * correct documentation, add option for setting min_num_masks * change the semantic meaning of `mask_prob` in _compute_mask_indices With this commit the meaning of `mask_prob` actually adheres to the probability for each vector to be the start of a masked span of length. * fix check_copies test * fix documentation to semantic meaning of `upper bound of overall masking percentage`, revert changes to _compute_mask_indices * fix typo
This commit is contained in:
@@ -101,17 +101,30 @@ class HubertConfig(PretrainedConfig):
|
||||
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
||||
<https://arxiv.org/abs/1904.08779>`__.
|
||||
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
||||
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
||||
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
|
||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
|
||||
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
|
||||
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
||||
Length of vector span along the time axis.
|
||||
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
|
||||
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
|
||||
step, irrespectively of ``mask_time_prob``. Only relevant if
|
||||
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
|
||||
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
||||
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
||||
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be
|
||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
|
||||
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
|
||||
is True``.
|
||||
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
||||
Length of vector span along the feature axis.
|
||||
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
|
||||
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
|
||||
step, irrespectively of ``mask_feature_prob``. Only relevant if
|
||||
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
|
||||
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
|
||||
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
|
||||
instance of :class:`~transformers.HubertForCTC`.
|
||||
@@ -169,8 +182,10 @@ class HubertConfig(PretrainedConfig):
|
||||
apply_spec_augment=True,
|
||||
mask_time_prob=0.05,
|
||||
mask_time_length=10,
|
||||
mask_time_min_masks=2,
|
||||
mask_feature_prob=0.0,
|
||||
mask_feature_length=10,
|
||||
mask_feature_min_masks=0,
|
||||
ctc_loss_reduction="sum",
|
||||
ctc_zero_infinity=False,
|
||||
use_weighted_layer_sum=False,
|
||||
@@ -225,8 +240,10 @@ class HubertConfig(PretrainedConfig):
|
||||
self.apply_spec_augment = apply_spec_augment
|
||||
self.mask_time_prob = mask_time_prob
|
||||
self.mask_time_length = mask_time_length
|
||||
self.mask_time_min_masks = mask_time_min_masks
|
||||
self.mask_feature_prob = mask_feature_prob
|
||||
self.mask_feature_length = mask_feature_length
|
||||
self.mask_feature_min_masks = mask_feature_min_masks
|
||||
|
||||
# ctc loss
|
||||
self.ctc_loss_reduction = ctc_loss_reduction
|
||||
|
||||
@@ -69,13 +69,16 @@ def _compute_mask_indices(
|
||||
on CPU as part of the preprocessing during training.
|
||||
|
||||
Args:
|
||||
shape: the the shape for which to compute masks.
|
||||
should be of size 2 where first element is batch size and 2nd is timesteps
|
||||
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
|
||||
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
|
||||
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
|
||||
shape: The shape for which to compute masks. This should be a tuple of size 2 where
|
||||
the first element is the batch size and the second element is the length of the axis to span.
|
||||
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
|
||||
independently generated mask spans of length `mask_length` is computed by
|
||||
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
|
||||
actual percentage will be smaller.
|
||||
mask_length: size of the mask
|
||||
min_masks: minimum number of masked spans
|
||||
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
|
||||
each batch dimension.
|
||||
"""
|
||||
batch_size, sequence_length = shape
|
||||
|
||||
@@ -84,9 +87,11 @@ def _compute_mask_indices(
|
||||
|
||||
if mask_length > sequence_length:
|
||||
raise ValueError(
|
||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
|
||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
|
||||
f" and `sequence_length`: {sequence_length}`"
|
||||
)
|
||||
|
||||
# epsilon is used for probabilistic rounding
|
||||
epsilon = np.random.rand(1).item()
|
||||
|
||||
def compute_num_masked_span(input_length):
|
||||
@@ -113,15 +118,21 @@ def _compute_mask_indices(
|
||||
|
||||
max_num_masked_span = compute_num_masked_span(sequence_length)
|
||||
|
||||
if max_num_masked_span == 0:
|
||||
return spec_aug_mask
|
||||
|
||||
for input_length in input_lengths:
|
||||
# compute num of masked spans for this input
|
||||
num_masked_span = compute_num_masked_span(input_length)
|
||||
|
||||
# get random indices to mask
|
||||
spec_aug_mask_idx = np.random.choice(
|
||||
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
||||
)
|
||||
|
||||
# pick first sampled index that will serve as a dummy index to pad vector
|
||||
# to ensure same dimension for all batches due to probabilistic rounding
|
||||
# Picking first sample just pads those vectors twice.
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
|
||||
spec_aug_mask_idx = np.concatenate(
|
||||
@@ -137,6 +148,7 @@ def _compute_mask_indices(
|
||||
)
|
||||
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
||||
|
||||
# add offset to the starting indexes so that that indexes now create a span
|
||||
offsets = np.arange(mask_length)[None, None, :]
|
||||
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
||||
batch_size, max_num_masked_span * mask_length
|
||||
@@ -930,7 +942,7 @@ class HubertModel(HubertPreTrainedModel):
|
||||
mask_prob=self.config.mask_time_prob,
|
||||
mask_length=self.config.mask_time_length,
|
||||
attention_mask=attention_mask,
|
||||
min_masks=2,
|
||||
min_masks=self.config.mask_time_min_masks,
|
||||
)
|
||||
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
||||
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
||||
@@ -941,6 +953,7 @@ class HubertModel(HubertPreTrainedModel):
|
||||
(batch_size, hidden_size),
|
||||
mask_prob=self.config.mask_feature_prob,
|
||||
mask_length=self.config.mask_feature_length,
|
||||
min_masks=self.config.mask_feature_min_masks,
|
||||
)
|
||||
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
||||
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
||||
|
||||
@@ -95,17 +95,30 @@ class SEWConfig(PretrainedConfig):
|
||||
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
||||
<https://arxiv.org/abs/1904.08779>`__.
|
||||
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
||||
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
||||
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
|
||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
|
||||
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
|
||||
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
||||
Length of vector span along the time axis.
|
||||
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
|
||||
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
|
||||
step, irrespectively of ``mask_time_prob``. Only relevant if
|
||||
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
|
||||
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
||||
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
||||
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be
|
||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
|
||||
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
|
||||
is True``.
|
||||
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
||||
Length of vector span along the feature axis.
|
||||
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
|
||||
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
|
||||
step, irrespectively of ``mask_feature_prob``. Only relevant if
|
||||
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
|
||||
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
|
||||
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
|
||||
instance of :class:`~transformers.SEWForCTC`.
|
||||
@@ -162,8 +175,10 @@ class SEWConfig(PretrainedConfig):
|
||||
apply_spec_augment=True,
|
||||
mask_time_prob=0.05,
|
||||
mask_time_length=10,
|
||||
mask_time_min_masks=2,
|
||||
mask_feature_prob=0.0,
|
||||
mask_feature_length=10,
|
||||
mask_feature_min_masks=0,
|
||||
ctc_loss_reduction="mean",
|
||||
ctc_zero_infinity=False,
|
||||
use_weighted_layer_sum=False,
|
||||
@@ -215,8 +230,10 @@ class SEWConfig(PretrainedConfig):
|
||||
self.apply_spec_augment = apply_spec_augment
|
||||
self.mask_time_prob = mask_time_prob
|
||||
self.mask_time_length = mask_time_length
|
||||
self.mask_time_min_masks = mask_time_min_masks
|
||||
self.mask_feature_prob = mask_feature_prob
|
||||
self.mask_feature_length = mask_feature_length
|
||||
self.mask_feature_min_masks = mask_feature_min_masks
|
||||
|
||||
# ctc loss
|
||||
self.ctc_loss_reduction = ctc_loss_reduction
|
||||
|
||||
@@ -67,13 +67,16 @@ def _compute_mask_indices(
|
||||
on CPU as part of the preprocessing during training.
|
||||
|
||||
Args:
|
||||
shape: the the shape for which to compute masks.
|
||||
should be of size 2 where first element is batch size and 2nd is timesteps
|
||||
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
|
||||
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
|
||||
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
|
||||
shape: The shape for which to compute masks. This should be a tuple of size 2 where
|
||||
the first element is the batch size and the second element is the length of the axis to span.
|
||||
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
|
||||
independently generated mask spans of length `mask_length` is computed by
|
||||
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
|
||||
actual percentage will be smaller.
|
||||
mask_length: size of the mask
|
||||
min_masks: minimum number of masked spans
|
||||
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
|
||||
each batch dimension.
|
||||
"""
|
||||
batch_size, sequence_length = shape
|
||||
|
||||
@@ -82,9 +85,11 @@ def _compute_mask_indices(
|
||||
|
||||
if mask_length > sequence_length:
|
||||
raise ValueError(
|
||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
|
||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
|
||||
f" and `sequence_length`: {sequence_length}`"
|
||||
)
|
||||
|
||||
# epsilon is used for probabilistic rounding
|
||||
epsilon = np.random.rand(1).item()
|
||||
|
||||
def compute_num_masked_span(input_length):
|
||||
@@ -111,15 +116,21 @@ def _compute_mask_indices(
|
||||
|
||||
max_num_masked_span = compute_num_masked_span(sequence_length)
|
||||
|
||||
if max_num_masked_span == 0:
|
||||
return spec_aug_mask
|
||||
|
||||
for input_length in input_lengths:
|
||||
# compute num of masked spans for this input
|
||||
num_masked_span = compute_num_masked_span(input_length)
|
||||
|
||||
# get random indices to mask
|
||||
spec_aug_mask_idx = np.random.choice(
|
||||
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
||||
)
|
||||
|
||||
# pick first sampled index that will serve as a dummy index to pad vector
|
||||
# to ensure same dimension for all batches due to probabilistic rounding
|
||||
# Picking first sample just pads those vectors twice.
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
|
||||
spec_aug_mask_idx = np.concatenate(
|
||||
@@ -135,6 +146,7 @@ def _compute_mask_indices(
|
||||
)
|
||||
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
||||
|
||||
# add offset to the starting indexes so that that indexes now create a span
|
||||
offsets = np.arange(mask_length)[None, None, :]
|
||||
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
||||
batch_size, max_num_masked_span * mask_length
|
||||
@@ -829,7 +841,7 @@ class SEWModel(SEWPreTrainedModel):
|
||||
mask_prob=self.config.mask_time_prob,
|
||||
mask_length=self.config.mask_time_length,
|
||||
attention_mask=attention_mask,
|
||||
min_masks=2,
|
||||
min_masks=self.config.mask_time_min_masks,
|
||||
)
|
||||
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
||||
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
||||
@@ -840,6 +852,7 @@ class SEWModel(SEWPreTrainedModel):
|
||||
(batch_size, hidden_size),
|
||||
mask_prob=self.config.mask_feature_prob,
|
||||
mask_length=self.config.mask_feature_length,
|
||||
min_masks=self.config.mask_feature_min_masks,
|
||||
)
|
||||
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
||||
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
||||
|
||||
@@ -113,17 +113,30 @@ class SEWDConfig(PretrainedConfig):
|
||||
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
||||
<https://arxiv.org/abs/1904.08779>`__.
|
||||
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
||||
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
||||
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
|
||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
|
||||
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
|
||||
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
||||
Length of vector span along the time axis.
|
||||
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
|
||||
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
|
||||
step, irrespectively of ``mask_time_prob``. Only relevant if
|
||||
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
|
||||
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
||||
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
||||
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be
|
||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
|
||||
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
|
||||
is True``.
|
||||
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
||||
Length of vector span along the feature axis.
|
||||
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
|
||||
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
|
||||
step, irrespectively of ``mask_feature_prob``. Only relevant if
|
||||
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
|
||||
diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1):
|
||||
The weight of the codebook diversity loss component.
|
||||
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
|
||||
@@ -190,8 +203,10 @@ class SEWDConfig(PretrainedConfig):
|
||||
apply_spec_augment=True,
|
||||
mask_time_prob=0.05,
|
||||
mask_time_length=10,
|
||||
mask_time_min_masks=2,
|
||||
mask_feature_prob=0.0,
|
||||
mask_feature_length=10,
|
||||
mask_feature_min_masks=0,
|
||||
ctc_loss_reduction="mean",
|
||||
ctc_zero_infinity=False,
|
||||
use_weighted_layer_sum=False,
|
||||
@@ -251,8 +266,10 @@ class SEWDConfig(PretrainedConfig):
|
||||
self.apply_spec_augment = apply_spec_augment
|
||||
self.mask_time_prob = mask_time_prob
|
||||
self.mask_time_length = mask_time_length
|
||||
self.mask_time_min_masks = mask_time_min_masks
|
||||
self.mask_feature_prob = mask_feature_prob
|
||||
self.mask_feature_length = mask_feature_length
|
||||
self.mask_feature_min_masks = mask_feature_min_masks
|
||||
|
||||
# ctc loss
|
||||
self.ctc_loss_reduction = ctc_loss_reduction
|
||||
|
||||
@@ -73,13 +73,16 @@ def _compute_mask_indices(
|
||||
on CPU as part of the preprocessing during training.
|
||||
|
||||
Args:
|
||||
shape: the the shape for which to compute masks.
|
||||
should be of size 2 where first element is batch size and 2nd is timesteps
|
||||
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
|
||||
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
|
||||
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
|
||||
shape: The shape for which to compute masks. This should be a tuple of size 2 where
|
||||
the first element is the batch size and the second element is the length of the axis to span.
|
||||
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
|
||||
independently generated mask spans of length `mask_length` is computed by
|
||||
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
|
||||
actual percentage will be smaller.
|
||||
mask_length: size of the mask
|
||||
min_masks: minimum number of masked spans
|
||||
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
|
||||
each batch dimension.
|
||||
"""
|
||||
batch_size, sequence_length = shape
|
||||
|
||||
@@ -88,9 +91,11 @@ def _compute_mask_indices(
|
||||
|
||||
if mask_length > sequence_length:
|
||||
raise ValueError(
|
||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
|
||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
|
||||
f" and `sequence_length`: {sequence_length}`"
|
||||
)
|
||||
|
||||
# epsilon is used for probabilistic rounding
|
||||
epsilon = np.random.rand(1).item()
|
||||
|
||||
def compute_num_masked_span(input_length):
|
||||
@@ -117,15 +122,21 @@ def _compute_mask_indices(
|
||||
|
||||
max_num_masked_span = compute_num_masked_span(sequence_length)
|
||||
|
||||
if max_num_masked_span == 0:
|
||||
return spec_aug_mask
|
||||
|
||||
for input_length in input_lengths:
|
||||
# compute num of masked spans for this input
|
||||
num_masked_span = compute_num_masked_span(input_length)
|
||||
|
||||
# get random indices to mask
|
||||
spec_aug_mask_idx = np.random.choice(
|
||||
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
||||
)
|
||||
|
||||
# pick first sampled index that will serve as a dummy index to pad vector
|
||||
# to ensure same dimension for all batches due to probabilistic rounding
|
||||
# Picking first sample just pads those vectors twice.
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
|
||||
spec_aug_mask_idx = np.concatenate(
|
||||
@@ -141,6 +152,7 @@ def _compute_mask_indices(
|
||||
)
|
||||
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
||||
|
||||
# add offset to the starting indexes so that that indexes now create a span
|
||||
offsets = np.arange(mask_length)[None, None, :]
|
||||
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
||||
batch_size, max_num_masked_span * mask_length
|
||||
@@ -1360,7 +1372,7 @@ class SEWDModel(SEWDPreTrainedModel):
|
||||
mask_prob=self.config.mask_time_prob,
|
||||
mask_length=self.config.mask_time_length,
|
||||
attention_mask=attention_mask,
|
||||
min_masks=2,
|
||||
min_masks=self.config.mask_time_min_masks,
|
||||
)
|
||||
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
||||
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
||||
@@ -1371,6 +1383,7 @@ class SEWDModel(SEWDPreTrainedModel):
|
||||
(batch_size, hidden_size),
|
||||
mask_prob=self.config.mask_feature_prob,
|
||||
mask_length=self.config.mask_feature_length,
|
||||
min_masks=self.config.mask_feature_min_masks,
|
||||
)
|
||||
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
||||
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
||||
|
||||
@@ -101,17 +101,30 @@ class UniSpeechConfig(PretrainedConfig):
|
||||
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
||||
<https://arxiv.org/abs/1904.08779>`__.
|
||||
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
||||
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
||||
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
|
||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
|
||||
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
|
||||
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
||||
Length of vector span along the time axis.
|
||||
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
|
||||
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
|
||||
step, irrespectively of ``mask_time_prob``. Only relevant if
|
||||
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
|
||||
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
||||
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
||||
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be
|
||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
|
||||
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
|
||||
is True``.
|
||||
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
||||
Length of vector span along the feature axis.
|
||||
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
|
||||
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
|
||||
step, irrespectively of ``mask_feature_prob``. Only relevant if
|
||||
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
|
||||
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
|
||||
Number of entries in each quantization codebook (group).
|
||||
num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
|
||||
@@ -187,8 +200,10 @@ class UniSpeechConfig(PretrainedConfig):
|
||||
apply_spec_augment=True,
|
||||
mask_time_prob=0.05,
|
||||
mask_time_length=10,
|
||||
mask_time_min_masks=2,
|
||||
mask_feature_prob=0.0,
|
||||
mask_feature_length=10,
|
||||
mask_feature_min_masks=0,
|
||||
num_codevectors_per_group=320,
|
||||
num_codevector_groups=2,
|
||||
contrastive_logits_temperature=0.1,
|
||||
@@ -252,8 +267,10 @@ class UniSpeechConfig(PretrainedConfig):
|
||||
self.apply_spec_augment = apply_spec_augment
|
||||
self.mask_time_prob = mask_time_prob
|
||||
self.mask_time_length = mask_time_length
|
||||
self.mask_time_min_masks = mask_time_min_masks
|
||||
self.mask_feature_prob = mask_feature_prob
|
||||
self.mask_feature_length = mask_feature_length
|
||||
self.mask_feature_min_masks = mask_feature_min_masks
|
||||
|
||||
# parameters for pretraining with codevector quantized representations
|
||||
self.num_codevectors_per_group = num_codevectors_per_group
|
||||
|
||||
@@ -136,13 +136,16 @@ def _compute_mask_indices(
|
||||
on CPU as part of the preprocessing during training.
|
||||
|
||||
Args:
|
||||
shape: the the shape for which to compute masks.
|
||||
should be of size 2 where first element is batch size and 2nd is timesteps
|
||||
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
|
||||
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
|
||||
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
|
||||
shape: The shape for which to compute masks. This should be a tuple of size 2 where
|
||||
the first element is the batch size and the second element is the length of the axis to span.
|
||||
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
|
||||
independently generated mask spans of length `mask_length` is computed by
|
||||
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
|
||||
actual percentage will be smaller.
|
||||
mask_length: size of the mask
|
||||
min_masks: minimum number of masked spans
|
||||
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
|
||||
each batch dimension.
|
||||
"""
|
||||
batch_size, sequence_length = shape
|
||||
|
||||
@@ -151,9 +154,11 @@ def _compute_mask_indices(
|
||||
|
||||
if mask_length > sequence_length:
|
||||
raise ValueError(
|
||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
|
||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
|
||||
f" and `sequence_length`: {sequence_length}`"
|
||||
)
|
||||
|
||||
# epsilon is used for probabilistic rounding
|
||||
epsilon = np.random.rand(1).item()
|
||||
|
||||
def compute_num_masked_span(input_length):
|
||||
@@ -180,15 +185,21 @@ def _compute_mask_indices(
|
||||
|
||||
max_num_masked_span = compute_num_masked_span(sequence_length)
|
||||
|
||||
if max_num_masked_span == 0:
|
||||
return spec_aug_mask
|
||||
|
||||
for input_length in input_lengths:
|
||||
# compute num of masked spans for this input
|
||||
num_masked_span = compute_num_masked_span(input_length)
|
||||
|
||||
# get random indices to mask
|
||||
spec_aug_mask_idx = np.random.choice(
|
||||
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
||||
)
|
||||
|
||||
# pick first sampled index that will serve as a dummy index to pad vector
|
||||
# to ensure same dimension for all batches due to probabilistic rounding
|
||||
# Picking first sample just pads those vectors twice.
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
|
||||
spec_aug_mask_idx = np.concatenate(
|
||||
@@ -204,6 +215,7 @@ def _compute_mask_indices(
|
||||
)
|
||||
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
||||
|
||||
# add offset to the starting indexes so that the indexes now create a span
|
||||
offsets = np.arange(mask_length)[None, None, :]
|
||||
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
||||
batch_size, max_num_masked_span * mask_length
|
||||
@@ -1076,7 +1088,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
|
||||
mask_prob=self.config.mask_time_prob,
|
||||
mask_length=self.config.mask_time_length,
|
||||
attention_mask=attention_mask,
|
||||
min_masks=2,
|
||||
min_masks=self.config.mask_time_min_masks,
|
||||
)
|
||||
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
||||
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
||||
@@ -1087,6 +1099,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
|
||||
(batch_size, hidden_size),
|
||||
mask_prob=self.config.mask_feature_prob,
|
||||
mask_length=self.config.mask_feature_length,
|
||||
min_masks=self.config.mask_feature_min_masks,
|
||||
)
|
||||
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
||||
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
||||
|
||||
@@ -101,17 +101,30 @@ class UniSpeechSatConfig(PretrainedConfig):
|
||||
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
||||
<https://arxiv.org/abs/1904.08779>`__.
|
||||
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
||||
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
||||
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
|
||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, ``mask_time_prob`` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
|
||||
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
|
||||
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
||||
Length of vector span along the time axis.
|
||||
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
|
||||
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
|
||||
step, irrespective of ``mask_time_prob``. Only relevant if
|
||||
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``.
|
||||
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
||||
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
||||
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be
|
||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, ``mask_feature_prob`` should be ``prob_vector_start*mask_feature_length``. Note that
|
||||
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
|
||||
is True``.
|
||||
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
||||
Length of vector span along the feature axis.
|
||||
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
|
||||
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
|
||||
step, irrespective of ``mask_feature_prob``. Only relevant if
|
||||
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``.
|
||||
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
|
||||
Number of entries in each quantization codebook (group).
|
||||
num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
|
||||
@@ -185,8 +198,10 @@ class UniSpeechSatConfig(PretrainedConfig):
|
||||
apply_spec_augment=True,
|
||||
mask_time_prob=0.05,
|
||||
mask_time_length=10,
|
||||
mask_time_min_masks=2,
|
||||
mask_feature_prob=0.0,
|
||||
mask_feature_length=10,
|
||||
mask_feature_min_masks=0,
|
||||
num_codevectors_per_group=320,
|
||||
num_codevector_groups=2,
|
||||
contrastive_logits_temperature=0.1,
|
||||
@@ -249,8 +264,10 @@ class UniSpeechSatConfig(PretrainedConfig):
|
||||
self.apply_spec_augment = apply_spec_augment
|
||||
self.mask_time_prob = mask_time_prob
|
||||
self.mask_time_length = mask_time_length
|
||||
self.mask_time_min_masks = mask_time_min_masks
|
||||
self.mask_feature_prob = mask_feature_prob
|
||||
self.mask_feature_length = mask_feature_length
|
||||
self.mask_feature_min_masks = mask_feature_min_masks
|
||||
|
||||
# parameters for pretraining with codevector quantized representations
|
||||
self.num_codevectors_per_group = num_codevectors_per_group
|
||||
|
||||
@@ -137,13 +137,16 @@ def _compute_mask_indices(
|
||||
on CPU as part of the preprocessing during training.
|
||||
|
||||
Args:
|
||||
shape: the the shape for which to compute masks.
|
||||
should be of size 2 where first element is batch size and 2nd is timesteps
|
||||
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
|
||||
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
|
||||
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
|
||||
shape: The shape for which to compute masks. This should be a tuple of size 2 where
|
||||
the first element is the batch size and the second element is the length of the axis to span.
|
||||
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
|
||||
independently generated mask spans of length `mask_length` is computed by
|
||||
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
|
||||
actual percentage will be smaller.
|
||||
mask_length: size of the mask
|
||||
min_masks: minimum number of masked spans
|
||||
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
|
||||
each batch dimension.
|
||||
"""
|
||||
batch_size, sequence_length = shape
|
||||
|
||||
@@ -152,9 +155,11 @@ def _compute_mask_indices(
|
||||
|
||||
if mask_length > sequence_length:
|
||||
raise ValueError(
|
||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
|
||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
|
||||
f" and `sequence_length`: {sequence_length}`"
|
||||
)
|
||||
|
||||
# epsilon is used for probabilistic rounding
|
||||
epsilon = np.random.rand(1).item()
|
||||
|
||||
def compute_num_masked_span(input_length):
|
||||
@@ -181,15 +186,21 @@ def _compute_mask_indices(
|
||||
|
||||
max_num_masked_span = compute_num_masked_span(sequence_length)
|
||||
|
||||
if max_num_masked_span == 0:
|
||||
return spec_aug_mask
|
||||
|
||||
for input_length in input_lengths:
|
||||
# compute num of masked spans for this input
|
||||
num_masked_span = compute_num_masked_span(input_length)
|
||||
|
||||
# get random indices to mask
|
||||
spec_aug_mask_idx = np.random.choice(
|
||||
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
||||
)
|
||||
|
||||
# pick first sampled index that will serve as a dummy index to pad vector
|
||||
# to ensure same dimension for all batches due to probabilistic rounding
|
||||
# Picking first sample just pads those vectors twice.
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
|
||||
spec_aug_mask_idx = np.concatenate(
|
||||
@@ -205,6 +216,7 @@ def _compute_mask_indices(
|
||||
)
|
||||
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
||||
|
||||
# add offset to the starting indexes so that the indexes now create a span
|
||||
offsets = np.arange(mask_length)[None, None, :]
|
||||
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
||||
batch_size, max_num_masked_span * mask_length
|
||||
@@ -1077,7 +1089,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
|
||||
mask_prob=self.config.mask_time_prob,
|
||||
mask_length=self.config.mask_time_length,
|
||||
attention_mask=attention_mask,
|
||||
min_masks=2,
|
||||
min_masks=self.config.mask_time_min_masks,
|
||||
)
|
||||
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
||||
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
||||
@@ -1088,6 +1100,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
|
||||
(batch_size, hidden_size),
|
||||
mask_prob=self.config.mask_feature_prob,
|
||||
mask_length=self.config.mask_feature_length,
|
||||
min_masks=self.config.mask_feature_min_masks,
|
||||
)
|
||||
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
||||
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
||||
|
||||
@@ -101,17 +101,30 @@ class Wav2Vec2Config(PretrainedConfig):
|
||||
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
||||
<https://arxiv.org/abs/1904.08779>`__.
|
||||
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
||||
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
||||
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
|
||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
||||
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||
procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
|
||||
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||
masked, ``mask_time_prob`` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
|
||||
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
|
||||
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
||||
Length of vector span along the time axis.
|
||||
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
|
||||
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
|
||||
step, irrespective of ``mask_time_prob``. Only relevant if
|
||||
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``.
|
||||
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
||||
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
||||
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be
|
||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
||||
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||
masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
|
||||
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||
span to be masked, ``mask_feature_prob`` should be ``prob_vector_start*mask_feature_length``. Note that
|
||||
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
|
||||
is True``.
|
||||
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
||||
Length of vector span along the feature axis.
|
||||
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
|
||||
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
|
||||
step, irrespective of ``mask_feature_prob``. Only relevant if
|
||||
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``.
|
||||
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
|
||||
Number of entries in each quantization codebook (group).
|
||||
num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
|
||||
@@ -198,8 +211,10 @@ class Wav2Vec2Config(PretrainedConfig):
|
||||
apply_spec_augment=True,
|
||||
mask_time_prob=0.05,
|
||||
mask_time_length=10,
|
||||
mask_time_min_masks=2,
|
||||
mask_feature_prob=0.0,
|
||||
mask_feature_length=10,
|
||||
mask_feature_min_masks=0,
|
||||
num_codevectors_per_group=320,
|
||||
num_codevector_groups=2,
|
||||
contrastive_logits_temperature=0.1,
|
||||
@@ -265,8 +280,10 @@ class Wav2Vec2Config(PretrainedConfig):
|
||||
self.apply_spec_augment = apply_spec_augment
|
||||
self.mask_time_prob = mask_time_prob
|
||||
self.mask_time_length = mask_time_length
|
||||
self.mask_time_min_masks = mask_time_min_masks
|
||||
self.mask_feature_prob = mask_feature_prob
|
||||
self.mask_feature_length = mask_feature_length
|
||||
self.mask_feature_min_masks = mask_feature_min_masks
|
||||
|
||||
# parameters for pretraining with codevector quantized representations
|
||||
self.num_codevectors_per_group = num_codevectors_per_group
|
||||
|
||||
@@ -145,13 +145,16 @@ def _compute_mask_indices(
|
||||
on CPU as part of the preprocessing during training.
|
||||
|
||||
Args:
|
||||
shape: the the shape for which to compute masks.
|
||||
should be of size 2 where first element is batch size and 2nd is timesteps
|
||||
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
|
||||
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
|
||||
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
|
||||
shape: The shape for which to compute masks. This should be a tuple of size 2 where
|
||||
the first element is the batch size and the second element is the length of the axis to span.
|
||||
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
|
||||
independently generated mask spans of length `mask_length` is computed by
|
||||
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
|
||||
actual percentage will be smaller.
|
||||
mask_length: size of the mask
|
||||
min_masks: minimum number of masked spans
|
||||
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
|
||||
each batch dimension.
|
||||
"""
|
||||
batch_size, sequence_length = shape
|
||||
|
||||
@@ -160,9 +163,11 @@ def _compute_mask_indices(
|
||||
|
||||
if mask_length > sequence_length:
|
||||
raise ValueError(
|
||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
|
||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
|
||||
f" and `sequence_length`: {sequence_length}`"
|
||||
)
|
||||
|
||||
# epsilon is used for probabilistic rounding
|
||||
epsilon = np.random.rand(1).item()
|
||||
|
||||
def compute_num_masked_span(input_length):
|
||||
@@ -189,15 +194,21 @@ def _compute_mask_indices(
|
||||
|
||||
max_num_masked_span = compute_num_masked_span(sequence_length)
|
||||
|
||||
if max_num_masked_span == 0:
|
||||
return spec_aug_mask
|
||||
|
||||
for input_length in input_lengths:
|
||||
# compute num of masked spans for this input
|
||||
num_masked_span = compute_num_masked_span(input_length)
|
||||
|
||||
# get random indices to mask
|
||||
spec_aug_mask_idx = np.random.choice(
|
||||
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
||||
)
|
||||
|
||||
# pick first sampled index that will serve as a dummy index to pad vector
|
||||
# to ensure same dimension for all batches due to probabilistic rounding
|
||||
# Picking first sample just pads those vectors twice.
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
|
||||
spec_aug_mask_idx = np.concatenate(
|
||||
@@ -213,6 +224,7 @@ def _compute_mask_indices(
|
||||
)
|
||||
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
||||
|
||||
# add offset to the starting indexes so that the indexes now create a span
|
||||
offsets = np.arange(mask_length)[None, None, :]
|
||||
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
||||
batch_size, max_num_masked_span * mask_length
|
||||
@@ -1182,7 +1194,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
|
||||
mask_prob=self.config.mask_time_prob,
|
||||
mask_length=self.config.mask_time_length,
|
||||
attention_mask=attention_mask,
|
||||
min_masks=2,
|
||||
min_masks=self.config.mask_time_min_masks,
|
||||
)
|
||||
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
||||
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
||||
@@ -1193,6 +1205,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
|
||||
(batch_size, hidden_size),
|
||||
mask_prob=self.config.mask_feature_prob,
|
||||
mask_length=self.config.mask_feature_length,
|
||||
min_masks=self.config.mask_feature_min_masks,
|
||||
)
|
||||
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
||||
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
||||
|
||||
@@ -854,6 +854,36 @@ class Wav2Vec2UtilsTest(unittest.TestCase):
|
||||
|
||||
self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)])
|
||||
|
||||
def test_compute_mask_indices_low_prob(self):
|
||||
# with these settings num_masked_spans=0.5, which means probabilistic rounding
|
||||
# ensures that in 5 out of 10 method calls, num_masked_spans=0, and in
|
||||
# the other 5 out of 10 cases, num_masked_spans=1
|
||||
n_trials = 100
|
||||
batch_size = 4
|
||||
sequence_length = 100
|
||||
mask_prob = 0.05
|
||||
mask_length = 10
|
||||
|
||||
count_dimensions_masked = 0
|
||||
count_dimensions_not_masked = 0
|
||||
|
||||
for _ in range(n_trials):
|
||||
mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
|
||||
mask = torch.from_numpy(mask).to(torch_device)
|
||||
|
||||
num_masks = torch.sum(mask).item()
|
||||
|
||||
if num_masks > 0:
|
||||
count_dimensions_masked += 1
|
||||
else:
|
||||
count_dimensions_not_masked += 1
|
||||
|
||||
# as we test for at least 10 masked dimension and at least
|
||||
# 10 non-masked dimension, this test could fail with probability:
|
||||
# P(100 coin flips, at most 9 heads) = 1.66e-18
|
||||
self.assertGreater(count_dimensions_masked, int(n_trials * 0.1))
|
||||
self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1))
|
||||
|
||||
def test_compute_mask_indices_overlap(self):
|
||||
batch_size = 4
|
||||
sequence_length = 80
|
||||
|
||||
Reference in New Issue
Block a user