* fix #14524 (IndexError when mask prob is too low) * fix formatting * correct documentation, add option for setting min_num_masks * change the semantic meaning of `mask_prob` in _compute_mask_indices With this commit the meaning of `mask_prob` actually adheres to the probability for each vector to be the start of a masked span of length `mask_length`. * fix check_copies test * fix documentation to semantic meaning of `upper bound of overall masking percentage`, revert changes to _compute_mask_indices * fix typo
This commit is contained in:
@@ -101,17 +101,30 @@ class HubertConfig(PretrainedConfig):
|
|||||||
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
||||||
<https://arxiv.org/abs/1904.08779>`__.
|
<https://arxiv.org/abs/1904.08779>`__.
|
||||||
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
||||||
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||||
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
|
procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
|
||||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||||
|
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
|
||||||
|
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
|
||||||
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
||||||
Length of vector span along the time axis.
|
Length of vector span along the time axis.
|
||||||
|
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
|
||||||
|
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
|
||||||
|
step, irrespective of ``mask_time_prob``. Only relevant if
|
||||||
|
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
|
||||||
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||||
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be
|
masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
|
||||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||||
|
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
|
||||||
|
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
|
||||||
|
is True``.
|
||||||
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
||||||
Length of vector span along the feature axis.
|
Length of vector span along the feature axis.
|
||||||
|
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
|
||||||
|
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
|
||||||
|
step, irrespectively of ``mask_feature_prob``. Only relevant if
|
||||||
|
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
|
||||||
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
|
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
|
||||||
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
|
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
|
||||||
instance of :class:`~transformers.HubertForCTC`.
|
instance of :class:`~transformers.HubertForCTC`.
|
||||||
@@ -169,8 +182,10 @@ class HubertConfig(PretrainedConfig):
|
|||||||
apply_spec_augment=True,
|
apply_spec_augment=True,
|
||||||
mask_time_prob=0.05,
|
mask_time_prob=0.05,
|
||||||
mask_time_length=10,
|
mask_time_length=10,
|
||||||
|
mask_time_min_masks=2,
|
||||||
mask_feature_prob=0.0,
|
mask_feature_prob=0.0,
|
||||||
mask_feature_length=10,
|
mask_feature_length=10,
|
||||||
|
mask_feature_min_masks=0,
|
||||||
ctc_loss_reduction="sum",
|
ctc_loss_reduction="sum",
|
||||||
ctc_zero_infinity=False,
|
ctc_zero_infinity=False,
|
||||||
use_weighted_layer_sum=False,
|
use_weighted_layer_sum=False,
|
||||||
@@ -225,8 +240,10 @@ class HubertConfig(PretrainedConfig):
|
|||||||
self.apply_spec_augment = apply_spec_augment
|
self.apply_spec_augment = apply_spec_augment
|
||||||
self.mask_time_prob = mask_time_prob
|
self.mask_time_prob = mask_time_prob
|
||||||
self.mask_time_length = mask_time_length
|
self.mask_time_length = mask_time_length
|
||||||
|
self.mask_time_min_masks = mask_time_min_masks
|
||||||
self.mask_feature_prob = mask_feature_prob
|
self.mask_feature_prob = mask_feature_prob
|
||||||
self.mask_feature_length = mask_feature_length
|
self.mask_feature_length = mask_feature_length
|
||||||
|
self.mask_feature_min_masks = mask_feature_min_masks
|
||||||
|
|
||||||
# ctc loss
|
# ctc loss
|
||||||
self.ctc_loss_reduction = ctc_loss_reduction
|
self.ctc_loss_reduction = ctc_loss_reduction
|
||||||
|
|||||||
@@ -69,13 +69,16 @@ def _compute_mask_indices(
|
|||||||
on CPU as part of the preprocessing during training.
|
on CPU as part of the preprocessing during training.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
shape: the the shape for which to compute masks.
|
shape: The shape for which to compute masks. This should be a tuple of size 2 where
|
||||||
should be of size 2 where first element is batch size and 2nd is timesteps
|
the first element is the batch size and the second element is the length of the axis to span.
|
||||||
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
|
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
|
||||||
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
|
independently generated mask spans of length `mask_length` is computed by
|
||||||
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
|
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
|
||||||
|
actual percentage will be smaller.
|
||||||
mask_length: size of the mask
|
mask_length: size of the mask
|
||||||
min_masks: minimum number of masked spans
|
min_masks: minimum number of masked spans
|
||||||
|
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
|
||||||
|
each batch dimension.
|
||||||
"""
|
"""
|
||||||
batch_size, sequence_length = shape
|
batch_size, sequence_length = shape
|
||||||
|
|
||||||
@@ -84,9 +87,11 @@ def _compute_mask_indices(
|
|||||||
|
|
||||||
if mask_length > sequence_length:
|
if mask_length > sequence_length:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
|
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
|
||||||
|
f" and `sequence_length`: {sequence_length}`"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# epsilon is used for probabilistic rounding
|
||||||
epsilon = np.random.rand(1).item()
|
epsilon = np.random.rand(1).item()
|
||||||
|
|
||||||
def compute_num_masked_span(input_length):
|
def compute_num_masked_span(input_length):
|
||||||
@@ -113,15 +118,21 @@ def _compute_mask_indices(
|
|||||||
|
|
||||||
max_num_masked_span = compute_num_masked_span(sequence_length)
|
max_num_masked_span = compute_num_masked_span(sequence_length)
|
||||||
|
|
||||||
|
if max_num_masked_span == 0:
|
||||||
|
return spec_aug_mask
|
||||||
|
|
||||||
for input_length in input_lengths:
|
for input_length in input_lengths:
|
||||||
# compute num of masked spans for this input
|
# compute num of masked spans for this input
|
||||||
num_masked_span = compute_num_masked_span(input_length)
|
num_masked_span = compute_num_masked_span(input_length)
|
||||||
|
|
||||||
# get random indices to mask
|
# get random indices to mask
|
||||||
spec_aug_mask_idx = np.random.choice(
|
spec_aug_mask_idx = np.random.choice(
|
||||||
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
||||||
)
|
)
|
||||||
|
|
||||||
# pick first sampled index that will serve as a dummy index to pad vector
|
# pick first sampled index that will serve as a dummy index to pad vector
|
||||||
|
# to ensure same dimension for all batches due to probabilistic rounding
|
||||||
|
# Picking first sample just pads those vectors twice.
|
||||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||||
|
|
||||||
spec_aug_mask_idx = np.concatenate(
|
spec_aug_mask_idx = np.concatenate(
|
||||||
@@ -137,6 +148,7 @@ def _compute_mask_indices(
|
|||||||
)
|
)
|
||||||
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
||||||
|
|
||||||
|
# add offset to the starting indexes so that the indexes now create a span
|
||||||
offsets = np.arange(mask_length)[None, None, :]
|
offsets = np.arange(mask_length)[None, None, :]
|
||||||
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
||||||
batch_size, max_num_masked_span * mask_length
|
batch_size, max_num_masked_span * mask_length
|
||||||
@@ -930,7 +942,7 @@ class HubertModel(HubertPreTrainedModel):
|
|||||||
mask_prob=self.config.mask_time_prob,
|
mask_prob=self.config.mask_time_prob,
|
||||||
mask_length=self.config.mask_time_length,
|
mask_length=self.config.mask_time_length,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
min_masks=2,
|
min_masks=self.config.mask_time_min_masks,
|
||||||
)
|
)
|
||||||
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
||||||
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
||||||
@@ -941,6 +953,7 @@ class HubertModel(HubertPreTrainedModel):
|
|||||||
(batch_size, hidden_size),
|
(batch_size, hidden_size),
|
||||||
mask_prob=self.config.mask_feature_prob,
|
mask_prob=self.config.mask_feature_prob,
|
||||||
mask_length=self.config.mask_feature_length,
|
mask_length=self.config.mask_feature_length,
|
||||||
|
min_masks=self.config.mask_feature_min_masks,
|
||||||
)
|
)
|
||||||
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
||||||
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
||||||
|
|||||||
@@ -95,17 +95,30 @@ class SEWConfig(PretrainedConfig):
|
|||||||
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
||||||
<https://arxiv.org/abs/1904.08779>`__.
|
<https://arxiv.org/abs/1904.08779>`__.
|
||||||
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
||||||
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||||
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
|
procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
|
||||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||||
|
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
|
||||||
|
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
|
||||||
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
||||||
Length of vector span along the time axis.
|
Length of vector span along the time axis.
|
||||||
|
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
|
||||||
|
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
|
||||||
|
step, irrespective of ``mask_time_prob``. Only relevant if
|
||||||
|
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
|
||||||
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||||
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be
|
masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
|
||||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||||
|
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
|
||||||
|
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
|
||||||
|
is True``.
|
||||||
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
||||||
Length of vector span along the feature axis.
|
Length of vector span along the feature axis.
|
||||||
|
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
|
||||||
|
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
|
||||||
|
step, irrespectively of ``mask_feature_prob``. Only relevant if
|
||||||
|
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
|
||||||
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
|
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
|
||||||
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
|
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
|
||||||
instance of :class:`~transformers.SEWForCTC`.
|
instance of :class:`~transformers.SEWForCTC`.
|
||||||
@@ -162,8 +175,10 @@ class SEWConfig(PretrainedConfig):
|
|||||||
apply_spec_augment=True,
|
apply_spec_augment=True,
|
||||||
mask_time_prob=0.05,
|
mask_time_prob=0.05,
|
||||||
mask_time_length=10,
|
mask_time_length=10,
|
||||||
|
mask_time_min_masks=2,
|
||||||
mask_feature_prob=0.0,
|
mask_feature_prob=0.0,
|
||||||
mask_feature_length=10,
|
mask_feature_length=10,
|
||||||
|
mask_feature_min_masks=0,
|
||||||
ctc_loss_reduction="mean",
|
ctc_loss_reduction="mean",
|
||||||
ctc_zero_infinity=False,
|
ctc_zero_infinity=False,
|
||||||
use_weighted_layer_sum=False,
|
use_weighted_layer_sum=False,
|
||||||
@@ -215,8 +230,10 @@ class SEWConfig(PretrainedConfig):
|
|||||||
self.apply_spec_augment = apply_spec_augment
|
self.apply_spec_augment = apply_spec_augment
|
||||||
self.mask_time_prob = mask_time_prob
|
self.mask_time_prob = mask_time_prob
|
||||||
self.mask_time_length = mask_time_length
|
self.mask_time_length = mask_time_length
|
||||||
|
self.mask_time_min_masks = mask_time_min_masks
|
||||||
self.mask_feature_prob = mask_feature_prob
|
self.mask_feature_prob = mask_feature_prob
|
||||||
self.mask_feature_length = mask_feature_length
|
self.mask_feature_length = mask_feature_length
|
||||||
|
self.mask_feature_min_masks = mask_feature_min_masks
|
||||||
|
|
||||||
# ctc loss
|
# ctc loss
|
||||||
self.ctc_loss_reduction = ctc_loss_reduction
|
self.ctc_loss_reduction = ctc_loss_reduction
|
||||||
|
|||||||
@@ -67,13 +67,16 @@ def _compute_mask_indices(
|
|||||||
on CPU as part of the preprocessing during training.
|
on CPU as part of the preprocessing during training.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
shape: the the shape for which to compute masks.
|
shape: The shape for which to compute masks. This should be a tuple of size 2 where
|
||||||
should be of size 2 where first element is batch size and 2nd is timesteps
|
the first element is the batch size and the second element is the length of the axis to span.
|
||||||
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
|
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
|
||||||
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
|
independently generated mask spans of length `mask_length` is computed by
|
||||||
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
|
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
|
||||||
|
actual percentage will be smaller.
|
||||||
mask_length: size of the mask
|
mask_length: size of the mask
|
||||||
min_masks: minimum number of masked spans
|
min_masks: minimum number of masked spans
|
||||||
|
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
|
||||||
|
each batch dimension.
|
||||||
"""
|
"""
|
||||||
batch_size, sequence_length = shape
|
batch_size, sequence_length = shape
|
||||||
|
|
||||||
@@ -82,9 +85,11 @@ def _compute_mask_indices(
|
|||||||
|
|
||||||
if mask_length > sequence_length:
|
if mask_length > sequence_length:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
|
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
|
||||||
|
f" and `sequence_length`: {sequence_length}`"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# epsilon is used for probabilistic rounding
|
||||||
epsilon = np.random.rand(1).item()
|
epsilon = np.random.rand(1).item()
|
||||||
|
|
||||||
def compute_num_masked_span(input_length):
|
def compute_num_masked_span(input_length):
|
||||||
@@ -111,15 +116,21 @@ def _compute_mask_indices(
|
|||||||
|
|
||||||
max_num_masked_span = compute_num_masked_span(sequence_length)
|
max_num_masked_span = compute_num_masked_span(sequence_length)
|
||||||
|
|
||||||
|
if max_num_masked_span == 0:
|
||||||
|
return spec_aug_mask
|
||||||
|
|
||||||
for input_length in input_lengths:
|
for input_length in input_lengths:
|
||||||
# compute num of masked spans for this input
|
# compute num of masked spans for this input
|
||||||
num_masked_span = compute_num_masked_span(input_length)
|
num_masked_span = compute_num_masked_span(input_length)
|
||||||
|
|
||||||
# get random indices to mask
|
# get random indices to mask
|
||||||
spec_aug_mask_idx = np.random.choice(
|
spec_aug_mask_idx = np.random.choice(
|
||||||
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
||||||
)
|
)
|
||||||
|
|
||||||
# pick first sampled index that will serve as a dummy index to pad vector
|
# pick first sampled index that will serve as a dummy index to pad vector
|
||||||
|
# to ensure same dimension for all batches due to probabilistic rounding
|
||||||
|
# Picking first sample just pads those vectors twice.
|
||||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||||
|
|
||||||
spec_aug_mask_idx = np.concatenate(
|
spec_aug_mask_idx = np.concatenate(
|
||||||
@@ -135,6 +146,7 @@ def _compute_mask_indices(
|
|||||||
)
|
)
|
||||||
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
||||||
|
|
||||||
|
# add offset to the starting indexes so that the indexes now create a span
|
||||||
offsets = np.arange(mask_length)[None, None, :]
|
offsets = np.arange(mask_length)[None, None, :]
|
||||||
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
||||||
batch_size, max_num_masked_span * mask_length
|
batch_size, max_num_masked_span * mask_length
|
||||||
@@ -829,7 +841,7 @@ class SEWModel(SEWPreTrainedModel):
|
|||||||
mask_prob=self.config.mask_time_prob,
|
mask_prob=self.config.mask_time_prob,
|
||||||
mask_length=self.config.mask_time_length,
|
mask_length=self.config.mask_time_length,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
min_masks=2,
|
min_masks=self.config.mask_time_min_masks,
|
||||||
)
|
)
|
||||||
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
||||||
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
||||||
@@ -840,6 +852,7 @@ class SEWModel(SEWPreTrainedModel):
|
|||||||
(batch_size, hidden_size),
|
(batch_size, hidden_size),
|
||||||
mask_prob=self.config.mask_feature_prob,
|
mask_prob=self.config.mask_feature_prob,
|
||||||
mask_length=self.config.mask_feature_length,
|
mask_length=self.config.mask_feature_length,
|
||||||
|
min_masks=self.config.mask_feature_min_masks,
|
||||||
)
|
)
|
||||||
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
||||||
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
||||||
|
|||||||
@@ -113,17 +113,30 @@ class SEWDConfig(PretrainedConfig):
|
|||||||
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
||||||
<https://arxiv.org/abs/1904.08779>`__.
|
<https://arxiv.org/abs/1904.08779>`__.
|
||||||
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
||||||
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||||
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
|
procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
|
||||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||||
|
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
|
||||||
|
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
|
||||||
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
||||||
Length of vector span along the time axis.
|
Length of vector span along the time axis.
|
||||||
|
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
|
||||||
|
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
|
||||||
|
step, irrespective of ``mask_time_prob``. Only relevant if
|
||||||
|
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
|
||||||
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||||
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be
|
masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
|
||||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||||
|
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
|
||||||
|
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
|
||||||
|
is True``.
|
||||||
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
||||||
Length of vector span along the feature axis.
|
Length of vector span along the feature axis.
|
||||||
|
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
|
||||||
|
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
|
||||||
|
step, irrespectively of ``mask_feature_prob``. Only relevant if
|
||||||
|
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
|
||||||
diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1):
|
diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1):
|
||||||
The weight of the codebook diversity loss component.
|
The weight of the codebook diversity loss component.
|
||||||
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
|
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
|
||||||
@@ -190,8 +203,10 @@ class SEWDConfig(PretrainedConfig):
|
|||||||
apply_spec_augment=True,
|
apply_spec_augment=True,
|
||||||
mask_time_prob=0.05,
|
mask_time_prob=0.05,
|
||||||
mask_time_length=10,
|
mask_time_length=10,
|
||||||
|
mask_time_min_masks=2,
|
||||||
mask_feature_prob=0.0,
|
mask_feature_prob=0.0,
|
||||||
mask_feature_length=10,
|
mask_feature_length=10,
|
||||||
|
mask_feature_min_masks=0,
|
||||||
ctc_loss_reduction="mean",
|
ctc_loss_reduction="mean",
|
||||||
ctc_zero_infinity=False,
|
ctc_zero_infinity=False,
|
||||||
use_weighted_layer_sum=False,
|
use_weighted_layer_sum=False,
|
||||||
@@ -251,8 +266,10 @@ class SEWDConfig(PretrainedConfig):
|
|||||||
self.apply_spec_augment = apply_spec_augment
|
self.apply_spec_augment = apply_spec_augment
|
||||||
self.mask_time_prob = mask_time_prob
|
self.mask_time_prob = mask_time_prob
|
||||||
self.mask_time_length = mask_time_length
|
self.mask_time_length = mask_time_length
|
||||||
|
self.mask_time_min_masks = mask_time_min_masks
|
||||||
self.mask_feature_prob = mask_feature_prob
|
self.mask_feature_prob = mask_feature_prob
|
||||||
self.mask_feature_length = mask_feature_length
|
self.mask_feature_length = mask_feature_length
|
||||||
|
self.mask_feature_min_masks = mask_feature_min_masks
|
||||||
|
|
||||||
# ctc loss
|
# ctc loss
|
||||||
self.ctc_loss_reduction = ctc_loss_reduction
|
self.ctc_loss_reduction = ctc_loss_reduction
|
||||||
|
|||||||
@@ -73,13 +73,16 @@ def _compute_mask_indices(
|
|||||||
on CPU as part of the preprocessing during training.
|
on CPU as part of the preprocessing during training.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
shape: the the shape for which to compute masks.
|
shape: The shape for which to compute masks. This should be a tuple of size 2 where
|
||||||
should be of size 2 where first element is batch size and 2nd is timesteps
|
the first element is the batch size and the second element is the length of the axis to span.
|
||||||
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
|
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
|
||||||
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
|
independently generated mask spans of length `mask_length` is computed by
|
||||||
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
|
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
|
||||||
|
actual percentage will be smaller.
|
||||||
mask_length: size of the mask
|
mask_length: size of the mask
|
||||||
min_masks: minimum number of masked spans
|
min_masks: minimum number of masked spans
|
||||||
|
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
|
||||||
|
each batch dimension.
|
||||||
"""
|
"""
|
||||||
batch_size, sequence_length = shape
|
batch_size, sequence_length = shape
|
||||||
|
|
||||||
@@ -88,9 +91,11 @@ def _compute_mask_indices(
|
|||||||
|
|
||||||
if mask_length > sequence_length:
|
if mask_length > sequence_length:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
|
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
|
||||||
|
f" and `sequence_length`: {sequence_length}`"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# epsilon is used for probabilistic rounding
|
||||||
epsilon = np.random.rand(1).item()
|
epsilon = np.random.rand(1).item()
|
||||||
|
|
||||||
def compute_num_masked_span(input_length):
|
def compute_num_masked_span(input_length):
|
||||||
@@ -117,15 +122,21 @@ def _compute_mask_indices(
|
|||||||
|
|
||||||
max_num_masked_span = compute_num_masked_span(sequence_length)
|
max_num_masked_span = compute_num_masked_span(sequence_length)
|
||||||
|
|
||||||
|
if max_num_masked_span == 0:
|
||||||
|
return spec_aug_mask
|
||||||
|
|
||||||
for input_length in input_lengths:
|
for input_length in input_lengths:
|
||||||
# compute num of masked spans for this input
|
# compute num of masked spans for this input
|
||||||
num_masked_span = compute_num_masked_span(input_length)
|
num_masked_span = compute_num_masked_span(input_length)
|
||||||
|
|
||||||
# get random indices to mask
|
# get random indices to mask
|
||||||
spec_aug_mask_idx = np.random.choice(
|
spec_aug_mask_idx = np.random.choice(
|
||||||
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
||||||
)
|
)
|
||||||
|
|
||||||
# pick first sampled index that will serve as a dummy index to pad vector
|
# pick first sampled index that will serve as a dummy index to pad vector
|
||||||
|
# to ensure same dimension for all batches due to probabilistic rounding
|
||||||
|
# Picking first sample just pads those vectors twice.
|
||||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||||
|
|
||||||
spec_aug_mask_idx = np.concatenate(
|
spec_aug_mask_idx = np.concatenate(
|
||||||
@@ -141,6 +152,7 @@ def _compute_mask_indices(
|
|||||||
)
|
)
|
||||||
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
||||||
|
|
||||||
|
# add offset to the starting indexes so that the indexes now create a span
|
||||||
offsets = np.arange(mask_length)[None, None, :]
|
offsets = np.arange(mask_length)[None, None, :]
|
||||||
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
||||||
batch_size, max_num_masked_span * mask_length
|
batch_size, max_num_masked_span * mask_length
|
||||||
@@ -1360,7 +1372,7 @@ class SEWDModel(SEWDPreTrainedModel):
|
|||||||
mask_prob=self.config.mask_time_prob,
|
mask_prob=self.config.mask_time_prob,
|
||||||
mask_length=self.config.mask_time_length,
|
mask_length=self.config.mask_time_length,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
min_masks=2,
|
min_masks=self.config.mask_time_min_masks,
|
||||||
)
|
)
|
||||||
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
||||||
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
||||||
@@ -1371,6 +1383,7 @@ class SEWDModel(SEWDPreTrainedModel):
|
|||||||
(batch_size, hidden_size),
|
(batch_size, hidden_size),
|
||||||
mask_prob=self.config.mask_feature_prob,
|
mask_prob=self.config.mask_feature_prob,
|
||||||
mask_length=self.config.mask_feature_length,
|
mask_length=self.config.mask_feature_length,
|
||||||
|
min_masks=self.config.mask_feature_min_masks,
|
||||||
)
|
)
|
||||||
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
||||||
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
||||||
|
|||||||
@@ -101,17 +101,30 @@ class UniSpeechConfig(PretrainedConfig):
|
|||||||
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
||||||
<https://arxiv.org/abs/1904.08779>`__.
|
<https://arxiv.org/abs/1904.08779>`__.
|
||||||
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
||||||
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||||
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
|
procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
|
||||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||||
|
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
|
||||||
|
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
|
||||||
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
||||||
Length of vector span along the time axis.
|
Length of vector span along the time axis.
|
||||||
|
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
|
||||||
|
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
|
||||||
|
step, irrespective of ``mask_time_prob``. Only relevant if
|
||||||
|
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
|
||||||
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||||
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be
|
masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
|
||||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||||
|
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
|
||||||
|
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
|
||||||
|
is True``.
|
||||||
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
||||||
Length of vector span along the feature axis.
|
Length of vector span along the feature axis.
|
||||||
|
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
|
||||||
|
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
|
||||||
|
step, irrespectively of ``mask_feature_prob``. Only relevant if
|
||||||
|
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
|
||||||
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
|
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
|
||||||
Number of entries in each quantization codebook (group).
|
Number of entries in each quantization codebook (group).
|
||||||
num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
|
num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
|
||||||
@@ -187,8 +200,10 @@ class UniSpeechConfig(PretrainedConfig):
|
|||||||
apply_spec_augment=True,
|
apply_spec_augment=True,
|
||||||
mask_time_prob=0.05,
|
mask_time_prob=0.05,
|
||||||
mask_time_length=10,
|
mask_time_length=10,
|
||||||
|
mask_time_min_masks=2,
|
||||||
mask_feature_prob=0.0,
|
mask_feature_prob=0.0,
|
||||||
mask_feature_length=10,
|
mask_feature_length=10,
|
||||||
|
mask_feature_min_masks=0,
|
||||||
num_codevectors_per_group=320,
|
num_codevectors_per_group=320,
|
||||||
num_codevector_groups=2,
|
num_codevector_groups=2,
|
||||||
contrastive_logits_temperature=0.1,
|
contrastive_logits_temperature=0.1,
|
||||||
@@ -252,8 +267,10 @@ class UniSpeechConfig(PretrainedConfig):
|
|||||||
self.apply_spec_augment = apply_spec_augment
|
self.apply_spec_augment = apply_spec_augment
|
||||||
self.mask_time_prob = mask_time_prob
|
self.mask_time_prob = mask_time_prob
|
||||||
self.mask_time_length = mask_time_length
|
self.mask_time_length = mask_time_length
|
||||||
|
self.mask_time_min_masks = mask_time_min_masks
|
||||||
self.mask_feature_prob = mask_feature_prob
|
self.mask_feature_prob = mask_feature_prob
|
||||||
self.mask_feature_length = mask_feature_length
|
self.mask_feature_length = mask_feature_length
|
||||||
|
self.mask_feature_min_masks = mask_feature_min_masks
|
||||||
|
|
||||||
# parameters for pretraining with codevector quantized representations
|
# parameters for pretraining with codevector quantized representations
|
||||||
self.num_codevectors_per_group = num_codevectors_per_group
|
self.num_codevectors_per_group = num_codevectors_per_group
|
||||||
|
|||||||
@@ -136,13 +136,16 @@ def _compute_mask_indices(
|
|||||||
on CPU as part of the preprocessing during training.
|
on CPU as part of the preprocessing during training.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
shape: the the shape for which to compute masks.
|
shape: The shape for which to compute masks. This should be a tuple of size 2 where
|
||||||
should be of size 2 where first element is batch size and 2nd is timesteps
|
the first element is the batch size and the second element is the length of the axis to span.
|
||||||
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
|
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
|
||||||
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
|
independently generated mask spans of length `mask_length` is computed by
|
||||||
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
|
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
|
||||||
|
actual percentage will be smaller.
|
||||||
mask_length: size of the mask
|
mask_length: size of the mask
|
||||||
min_masks: minimum number of masked spans
|
min_masks: minimum number of masked spans
|
||||||
|
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
|
||||||
|
each batch dimension.
|
||||||
"""
|
"""
|
||||||
batch_size, sequence_length = shape
|
batch_size, sequence_length = shape
|
||||||
|
|
||||||
@@ -151,9 +154,11 @@ def _compute_mask_indices(
|
|||||||
|
|
||||||
if mask_length > sequence_length:
|
if mask_length > sequence_length:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
|
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
|
||||||
|
f" and `sequence_length`: {sequence_length}`"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# epsilon is used for probabilistic rounding
|
||||||
epsilon = np.random.rand(1).item()
|
epsilon = np.random.rand(1).item()
|
||||||
|
|
||||||
def compute_num_masked_span(input_length):
|
def compute_num_masked_span(input_length):
|
||||||
@@ -180,15 +185,21 @@ def _compute_mask_indices(
|
|||||||
|
|
||||||
max_num_masked_span = compute_num_masked_span(sequence_length)
|
max_num_masked_span = compute_num_masked_span(sequence_length)
|
||||||
|
|
||||||
|
if max_num_masked_span == 0:
|
||||||
|
return spec_aug_mask
|
||||||
|
|
||||||
for input_length in input_lengths:
|
for input_length in input_lengths:
|
||||||
# compute num of masked spans for this input
|
# compute num of masked spans for this input
|
||||||
num_masked_span = compute_num_masked_span(input_length)
|
num_masked_span = compute_num_masked_span(input_length)
|
||||||
|
|
||||||
# get random indices to mask
|
# get random indices to mask
|
||||||
spec_aug_mask_idx = np.random.choice(
|
spec_aug_mask_idx = np.random.choice(
|
||||||
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
||||||
)
|
)
|
||||||
|
|
||||||
# pick first sampled index that will serve as a dummy index to pad vector
|
# pick first sampled index that will serve as a dummy index to pad vector
|
||||||
|
# to ensure same dimension for all batches due to probabilistic rounding
|
||||||
|
# Picking first sample just pads those vectors twice.
|
||||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||||
|
|
||||||
spec_aug_mask_idx = np.concatenate(
|
spec_aug_mask_idx = np.concatenate(
|
||||||
@@ -204,6 +215,7 @@ def _compute_mask_indices(
|
|||||||
)
|
)
|
||||||
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
||||||
|
|
||||||
|
# add offset to the starting indexes so that the indexes now create a span
|
||||||
offsets = np.arange(mask_length)[None, None, :]
|
offsets = np.arange(mask_length)[None, None, :]
|
||||||
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
||||||
batch_size, max_num_masked_span * mask_length
|
batch_size, max_num_masked_span * mask_length
|
||||||
@@ -1076,7 +1088,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
|
|||||||
mask_prob=self.config.mask_time_prob,
|
mask_prob=self.config.mask_time_prob,
|
||||||
mask_length=self.config.mask_time_length,
|
mask_length=self.config.mask_time_length,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
min_masks=2,
|
min_masks=self.config.mask_time_min_masks,
|
||||||
)
|
)
|
||||||
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
||||||
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
||||||
@@ -1087,6 +1099,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
|
|||||||
(batch_size, hidden_size),
|
(batch_size, hidden_size),
|
||||||
mask_prob=self.config.mask_feature_prob,
|
mask_prob=self.config.mask_feature_prob,
|
||||||
mask_length=self.config.mask_feature_length,
|
mask_length=self.config.mask_feature_length,
|
||||||
|
min_masks=self.config.mask_feature_min_masks,
|
||||||
)
|
)
|
||||||
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
||||||
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
||||||
|
|||||||
@@ -101,17 +101,30 @@ class UniSpeechSatConfig(PretrainedConfig):
|
|||||||
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
||||||
<https://arxiv.org/abs/1904.08779>`__.
|
<https://arxiv.org/abs/1904.08779>`__.
|
||||||
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
||||||
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||||
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
|
procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
|
||||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||||
|
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
|
||||||
|
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
|
||||||
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
||||||
Length of vector span along the time axis.
|
Length of vector span along the time axis.
|
||||||
|
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
|
||||||
|
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
|
||||||
|
step, irrespective of ``mask_time_prob``. Only relevant if
|
||||||
|
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
|
||||||
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||||
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be
|
masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
|
||||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||||
|
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
|
||||||
|
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
|
||||||
|
is True``.
|
||||||
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
||||||
Length of vector span along the feature axis.
|
Length of vector span along the feature axis.
|
||||||
|
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
|
||||||
|
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
|
||||||
|
step, irrespectively of ``mask_feature_prob``. Only relevant if
|
||||||
|
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
|
||||||
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
|
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
|
||||||
Number of entries in each quantization codebook (group).
|
Number of entries in each quantization codebook (group).
|
||||||
num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
|
num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
|
||||||
@@ -185,8 +198,10 @@ class UniSpeechSatConfig(PretrainedConfig):
|
|||||||
apply_spec_augment=True,
|
apply_spec_augment=True,
|
||||||
mask_time_prob=0.05,
|
mask_time_prob=0.05,
|
||||||
mask_time_length=10,
|
mask_time_length=10,
|
||||||
|
mask_time_min_masks=2,
|
||||||
mask_feature_prob=0.0,
|
mask_feature_prob=0.0,
|
||||||
mask_feature_length=10,
|
mask_feature_length=10,
|
||||||
|
mask_feature_min_masks=0,
|
||||||
num_codevectors_per_group=320,
|
num_codevectors_per_group=320,
|
||||||
num_codevector_groups=2,
|
num_codevector_groups=2,
|
||||||
contrastive_logits_temperature=0.1,
|
contrastive_logits_temperature=0.1,
|
||||||
@@ -249,8 +264,10 @@ class UniSpeechSatConfig(PretrainedConfig):
|
|||||||
self.apply_spec_augment = apply_spec_augment
|
self.apply_spec_augment = apply_spec_augment
|
||||||
self.mask_time_prob = mask_time_prob
|
self.mask_time_prob = mask_time_prob
|
||||||
self.mask_time_length = mask_time_length
|
self.mask_time_length = mask_time_length
|
||||||
|
self.mask_time_min_masks = mask_time_min_masks
|
||||||
self.mask_feature_prob = mask_feature_prob
|
self.mask_feature_prob = mask_feature_prob
|
||||||
self.mask_feature_length = mask_feature_length
|
self.mask_feature_length = mask_feature_length
|
||||||
|
self.mask_feature_min_masks = mask_feature_min_masks
|
||||||
|
|
||||||
# parameters for pretraining with codevector quantized representations
|
# parameters for pretraining with codevector quantized representations
|
||||||
self.num_codevectors_per_group = num_codevectors_per_group
|
self.num_codevectors_per_group = num_codevectors_per_group
|
||||||
|
|||||||
@@ -137,13 +137,16 @@ def _compute_mask_indices(
|
|||||||
on CPU as part of the preprocessing during training.
|
on CPU as part of the preprocessing during training.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
shape: the the shape for which to compute masks.
|
shape: The shape for which to compute masks. This should be a tuple of size 2 where
|
||||||
should be of size 2 where first element is batch size and 2nd is timesteps
|
the first element is the batch size and the second element is the length of the axis to span.
|
||||||
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
|
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
|
||||||
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
|
independently generated mask spans of length `mask_length` is computed by
|
||||||
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
|
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
|
||||||
|
actual percentage will be smaller.
|
||||||
mask_length: size of the mask
|
mask_length: size of the mask
|
||||||
min_masks: minimum number of masked spans
|
min_masks: minimum number of masked spans
|
||||||
|
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
|
||||||
|
each batch dimension.
|
||||||
"""
|
"""
|
||||||
batch_size, sequence_length = shape
|
batch_size, sequence_length = shape
|
||||||
|
|
||||||
@@ -152,9 +155,11 @@ def _compute_mask_indices(
|
|||||||
|
|
||||||
if mask_length > sequence_length:
|
if mask_length > sequence_length:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
|
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
|
||||||
|
f" and `sequence_length`: {sequence_length}`"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# epsilon is used for probabilistic rounding
|
||||||
epsilon = np.random.rand(1).item()
|
epsilon = np.random.rand(1).item()
|
||||||
|
|
||||||
def compute_num_masked_span(input_length):
|
def compute_num_masked_span(input_length):
|
||||||
@@ -181,15 +186,21 @@ def _compute_mask_indices(
|
|||||||
|
|
||||||
max_num_masked_span = compute_num_masked_span(sequence_length)
|
max_num_masked_span = compute_num_masked_span(sequence_length)
|
||||||
|
|
||||||
|
if max_num_masked_span == 0:
|
||||||
|
return spec_aug_mask
|
||||||
|
|
||||||
for input_length in input_lengths:
|
for input_length in input_lengths:
|
||||||
# compute num of masked spans for this input
|
# compute num of masked spans for this input
|
||||||
num_masked_span = compute_num_masked_span(input_length)
|
num_masked_span = compute_num_masked_span(input_length)
|
||||||
|
|
||||||
# get random indices to mask
|
# get random indices to mask
|
||||||
spec_aug_mask_idx = np.random.choice(
|
spec_aug_mask_idx = np.random.choice(
|
||||||
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
||||||
)
|
)
|
||||||
|
|
||||||
# pick first sampled index that will serve as a dummy index to pad vector
|
# pick first sampled index that will serve as a dummy index to pad vector
|
||||||
|
# to ensure same dimension for all batches due to probabilistic rounding
|
||||||
|
# Picking first sample just pads those vectors twice.
|
||||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||||
|
|
||||||
spec_aug_mask_idx = np.concatenate(
|
spec_aug_mask_idx = np.concatenate(
|
||||||
@@ -205,6 +216,7 @@ def _compute_mask_indices(
|
|||||||
)
|
)
|
||||||
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
||||||
|
|
||||||
|
# add offset to the starting indexes so that the indexes now create a span
|
||||||
offsets = np.arange(mask_length)[None, None, :]
|
offsets = np.arange(mask_length)[None, None, :]
|
||||||
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
||||||
batch_size, max_num_masked_span * mask_length
|
batch_size, max_num_masked_span * mask_length
|
||||||
@@ -1077,7 +1089,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
|
|||||||
mask_prob=self.config.mask_time_prob,
|
mask_prob=self.config.mask_time_prob,
|
||||||
mask_length=self.config.mask_time_length,
|
mask_length=self.config.mask_time_length,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
min_masks=2,
|
min_masks=self.config.mask_time_min_masks,
|
||||||
)
|
)
|
||||||
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
||||||
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
||||||
@@ -1088,6 +1100,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
|
|||||||
(batch_size, hidden_size),
|
(batch_size, hidden_size),
|
||||||
mask_prob=self.config.mask_feature_prob,
|
mask_prob=self.config.mask_feature_prob,
|
||||||
mask_length=self.config.mask_feature_length,
|
mask_length=self.config.mask_feature_length,
|
||||||
|
min_masks=self.config.mask_feature_min_masks,
|
||||||
)
|
)
|
||||||
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
||||||
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
||||||
|
|||||||
@@ -101,17 +101,30 @@ class Wav2Vec2Config(PretrainedConfig):
|
|||||||
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
|
||||||
<https://arxiv.org/abs/1904.08779>`__.
|
<https://arxiv.org/abs/1904.08779>`__.
|
||||||
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
|
||||||
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
|
Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
|
||||||
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
|
procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
|
||||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
|
||||||
|
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
|
||||||
|
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
|
||||||
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
mask_time_length (:obj:`int`, `optional`, defaults to 10):
|
||||||
Length of vector span along the time axis.
|
Length of vector span along the time axis.
|
||||||
|
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
|
||||||
|
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
|
||||||
|
step, irrespective of ``mask_time_prob``. Only relevant if
|
||||||
|
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
|
||||||
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
|
||||||
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
|
Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
|
||||||
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be
|
masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
|
||||||
masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
|
the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
|
||||||
|
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
|
||||||
|
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
|
||||||
|
is True``.
|
||||||
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
mask_feature_length (:obj:`int`, `optional`, defaults to 10):
|
||||||
Length of vector span along the feature axis.
|
Length of vector span along the feature axis.
|
||||||
|
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
|
||||||
|
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
|
||||||
|
step, irrespectively of ``mask_feature_prob``. Only relevant if
|
||||||
|
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
|
||||||
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
|
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
|
||||||
Number of entries in each quantization codebook (group).
|
Number of entries in each quantization codebook (group).
|
||||||
num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
|
num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
|
||||||
@@ -198,8 +211,10 @@ class Wav2Vec2Config(PretrainedConfig):
|
|||||||
apply_spec_augment=True,
|
apply_spec_augment=True,
|
||||||
mask_time_prob=0.05,
|
mask_time_prob=0.05,
|
||||||
mask_time_length=10,
|
mask_time_length=10,
|
||||||
|
mask_time_min_masks=2,
|
||||||
mask_feature_prob=0.0,
|
mask_feature_prob=0.0,
|
||||||
mask_feature_length=10,
|
mask_feature_length=10,
|
||||||
|
mask_feature_min_masks=0,
|
||||||
num_codevectors_per_group=320,
|
num_codevectors_per_group=320,
|
||||||
num_codevector_groups=2,
|
num_codevector_groups=2,
|
||||||
contrastive_logits_temperature=0.1,
|
contrastive_logits_temperature=0.1,
|
||||||
@@ -265,8 +280,10 @@ class Wav2Vec2Config(PretrainedConfig):
|
|||||||
self.apply_spec_augment = apply_spec_augment
|
self.apply_spec_augment = apply_spec_augment
|
||||||
self.mask_time_prob = mask_time_prob
|
self.mask_time_prob = mask_time_prob
|
||||||
self.mask_time_length = mask_time_length
|
self.mask_time_length = mask_time_length
|
||||||
|
self.mask_time_min_masks = mask_time_min_masks
|
||||||
self.mask_feature_prob = mask_feature_prob
|
self.mask_feature_prob = mask_feature_prob
|
||||||
self.mask_feature_length = mask_feature_length
|
self.mask_feature_length = mask_feature_length
|
||||||
|
self.mask_feature_min_masks = mask_feature_min_masks
|
||||||
|
|
||||||
# parameters for pretraining with codevector quantized representations
|
# parameters for pretraining with codevector quantized representations
|
||||||
self.num_codevectors_per_group = num_codevectors_per_group
|
self.num_codevectors_per_group = num_codevectors_per_group
|
||||||
|
|||||||
@@ -145,13 +145,16 @@ def _compute_mask_indices(
|
|||||||
on CPU as part of the preprocessing during training.
|
on CPU as part of the preprocessing during training.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
shape: the the shape for which to compute masks.
|
shape: The shape for which to compute masks. This should be a tuple of size 2 where
|
||||||
should be of size 2 where first element is batch size and 2nd is timesteps
|
the first element is the batch size and the second element is the length of the axis to span.
|
||||||
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
|
mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
|
||||||
number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
|
independently generated mask spans of length `mask_length` is computed by
|
||||||
however due to overlaps, the actual number will be smaller (unless no_overlap is True)
|
`mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
|
||||||
|
actual percentage will be smaller.
|
||||||
mask_length: size of the mask
|
mask_length: size of the mask
|
||||||
min_masks: minimum number of masked spans
|
min_masks: minimum number of masked spans
|
||||||
|
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
|
||||||
|
each batch dimension.
|
||||||
"""
|
"""
|
||||||
batch_size, sequence_length = shape
|
batch_size, sequence_length = shape
|
||||||
|
|
||||||
@@ -160,9 +163,11 @@ def _compute_mask_indices(
|
|||||||
|
|
||||||
if mask_length > sequence_length:
|
if mask_length > sequence_length:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`"
|
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
|
||||||
|
f" and `sequence_length`: {sequence_length}`"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# epsilon is used for probabilistic rounding
|
||||||
epsilon = np.random.rand(1).item()
|
epsilon = np.random.rand(1).item()
|
||||||
|
|
||||||
def compute_num_masked_span(input_length):
|
def compute_num_masked_span(input_length):
|
||||||
@@ -189,15 +194,21 @@ def _compute_mask_indices(
|
|||||||
|
|
||||||
max_num_masked_span = compute_num_masked_span(sequence_length)
|
max_num_masked_span = compute_num_masked_span(sequence_length)
|
||||||
|
|
||||||
|
if max_num_masked_span == 0:
|
||||||
|
return spec_aug_mask
|
||||||
|
|
||||||
for input_length in input_lengths:
|
for input_length in input_lengths:
|
||||||
# compute num of masked spans for this input
|
# compute num of masked spans for this input
|
||||||
num_masked_span = compute_num_masked_span(input_length)
|
num_masked_span = compute_num_masked_span(input_length)
|
||||||
|
|
||||||
# get random indices to mask
|
# get random indices to mask
|
||||||
spec_aug_mask_idx = np.random.choice(
|
spec_aug_mask_idx = np.random.choice(
|
||||||
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
|
||||||
)
|
)
|
||||||
|
|
||||||
# pick first sampled index that will serve as a dummy index to pad vector
|
# pick first sampled index that will serve as a dummy index to pad vector
|
||||||
|
# to ensure same dimension for all batches due to probabilistic rounding
|
||||||
|
# Picking first sample just pads those vectors twice.
|
||||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||||
|
|
||||||
spec_aug_mask_idx = np.concatenate(
|
spec_aug_mask_idx = np.concatenate(
|
||||||
@@ -213,6 +224,7 @@ def _compute_mask_indices(
|
|||||||
)
|
)
|
||||||
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
|
||||||
|
|
||||||
|
# add offset to the starting indexes so that the indexes now create a span
|
||||||
offsets = np.arange(mask_length)[None, None, :]
|
offsets = np.arange(mask_length)[None, None, :]
|
||||||
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
|
||||||
batch_size, max_num_masked_span * mask_length
|
batch_size, max_num_masked_span * mask_length
|
||||||
@@ -1182,7 +1194,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
|
|||||||
mask_prob=self.config.mask_time_prob,
|
mask_prob=self.config.mask_time_prob,
|
||||||
mask_length=self.config.mask_time_length,
|
mask_length=self.config.mask_time_length,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
min_masks=2,
|
min_masks=self.config.mask_time_min_masks,
|
||||||
)
|
)
|
||||||
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
|
||||||
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
|
||||||
@@ -1193,6 +1205,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
|
|||||||
(batch_size, hidden_size),
|
(batch_size, hidden_size),
|
||||||
mask_prob=self.config.mask_feature_prob,
|
mask_prob=self.config.mask_feature_prob,
|
||||||
mask_length=self.config.mask_feature_length,
|
mask_length=self.config.mask_feature_length,
|
||||||
|
min_masks=self.config.mask_feature_min_masks,
|
||||||
)
|
)
|
||||||
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
|
||||||
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
|
||||||
|
|||||||
@@ -854,6 +854,36 @@ class Wav2Vec2UtilsTest(unittest.TestCase):
|
|||||||
|
|
||||||
self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)])
|
self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)])
|
||||||
|
|
||||||
|
def test_compute_mask_indices_low_prob(self):
|
||||||
|
# with these settings num_masked_spans=0.5, which means probabilistic rounding
|
||||||
|
# ensures that in 5 out of 10 method calls, num_masked_spans=0, and in
|
||||||
|
# the other 5 out of 10 cases, num_masked_spans=1
|
||||||
|
n_trials = 100
|
||||||
|
batch_size = 4
|
||||||
|
sequence_length = 100
|
||||||
|
mask_prob = 0.05
|
||||||
|
mask_length = 10
|
||||||
|
|
||||||
|
count_dimensions_masked = 0
|
||||||
|
count_dimensions_not_masked = 0
|
||||||
|
|
||||||
|
for _ in range(n_trials):
|
||||||
|
mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
|
||||||
|
mask = torch.from_numpy(mask).to(torch_device)
|
||||||
|
|
||||||
|
num_masks = torch.sum(mask).item()
|
||||||
|
|
||||||
|
if num_masks > 0:
|
||||||
|
count_dimensions_masked += 1
|
||||||
|
else:
|
||||||
|
count_dimensions_not_masked += 1
|
||||||
|
|
||||||
|
# as we test for at least 10 masked dimension and at least
|
||||||
|
# 10 non-masked dimension, this test could fail with probability:
|
||||||
|
# P(100 coin flips, at most 9 heads) = 1.66e-18
|
||||||
|
self.assertGreater(count_dimensions_masked, int(n_trials * 0.1))
|
||||||
|
self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1))
|
||||||
|
|
||||||
def test_compute_mask_indices_overlap(self):
|
def test_compute_mask_indices_overlap(self):
|
||||||
batch_size = 4
|
batch_size = 4
|
||||||
sequence_length = 80
|
sequence_length = 80
|
||||||
|
|||||||
Reference in New Issue
Block a user