Update modeling_wav2vec2.py (#15423)
* Update modeling_wav2vec2.py

  With very short sound files (less than 0.1 seconds), `num_masked_span` can be too large. The problem is described in issue #15366 and was discussed with @patrickvonplaten.
* correct errors with mask time indices
* remove bogus file
* make fix-copies

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
This commit is contained in:
@@ -114,10 +114,14 @@ def _compute_mask_indices(
|
||||
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
|
||||
num_masked_span = max(num_masked_span, min_masks)
|
||||
|
||||
# make sure num masked indices <= sequence_length
|
||||
# make sure num masked span <= sequence_length
|
||||
if num_masked_span * mask_length > sequence_length:
|
||||
num_masked_span = sequence_length // mask_length
|
||||
|
||||
# make sure num_masked span is also <= input_length - (mask_length - 1)
|
||||
if input_length - (mask_length - 1) < num_masked_span:
|
||||
num_masked_span = max(input_length - (mask_length - 1), 0)
|
||||
|
||||
return num_masked_span
|
||||
|
||||
# compute number of masked spans in batch
|
||||
@@ -148,7 +152,13 @@ def _compute_mask_indices(
|
||||
# pick first sampled index that will serve as a dummy index to pad vector
|
||||
# to ensure same dimension for all batches due to probabilistic rounding
|
||||
# Picking first sample just pads those vectors twice.
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
if len(spec_aug_mask_idx) == 0:
|
||||
# this case can only happen if `input_length` is strictly smaller than
|
||||
# `sequence_length` in which case the last token has to be a padding
|
||||
# token which we can use as a dummy mask id
|
||||
dummy_mask_idx = sequence_length - 1
|
||||
else:
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
|
||||
spec_aug_mask_idx = np.concatenate(
|
||||
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
|
||||
@@ -170,6 +180,10 @@ def _compute_mask_indices(
|
||||
)
|
||||
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
|
||||
|
||||
# ensure that we cannot have indices larger than sequence_length
|
||||
if spec_aug_mask_idxs.max() > sequence_length - 1:
|
||||
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
|
||||
|
||||
# scatter indices to mask
|
||||
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
|
||||
|
||||
|
||||
@@ -115,10 +115,14 @@ def _compute_mask_indices(
|
||||
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
|
||||
num_masked_span = max(num_masked_span, min_masks)
|
||||
|
||||
# make sure num masked indices <= sequence_length
|
||||
# make sure num masked span <= sequence_length
|
||||
if num_masked_span * mask_length > sequence_length:
|
||||
num_masked_span = sequence_length // mask_length
|
||||
|
||||
# make sure num_masked span is also <= input_length - (mask_length - 1)
|
||||
if input_length - (mask_length - 1) < num_masked_span:
|
||||
num_masked_span = max(input_length - (mask_length - 1), 0)
|
||||
|
||||
return num_masked_span
|
||||
|
||||
# compute number of masked spans in batch
|
||||
@@ -149,7 +153,13 @@ def _compute_mask_indices(
|
||||
# pick first sampled index that will serve as a dummy index to pad vector
|
||||
# to ensure same dimension for all batches due to probabilistic rounding
|
||||
# Picking first sample just pads those vectors twice.
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
if len(spec_aug_mask_idx) == 0:
|
||||
# this case can only happen if `input_length` is strictly smaller than
|
||||
# `sequence_length` in which case the last token has to be a padding
|
||||
# token which we can use as a dummy mask id
|
||||
dummy_mask_idx = sequence_length - 1
|
||||
else:
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
|
||||
spec_aug_mask_idx = np.concatenate(
|
||||
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
|
||||
@@ -171,6 +181,10 @@ def _compute_mask_indices(
|
||||
)
|
||||
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
|
||||
|
||||
# ensure that we cannot have indices larger than sequence_length
|
||||
if spec_aug_mask_idxs.max() > sequence_length - 1:
|
||||
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
|
||||
|
||||
# scatter indices to mask
|
||||
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
|
||||
|
||||
|
||||
@@ -117,10 +117,14 @@ def _compute_mask_indices(
|
||||
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
|
||||
num_masked_span = max(num_masked_span, min_masks)
|
||||
|
||||
# make sure num masked indices <= sequence_length
|
||||
# make sure num masked span <= sequence_length
|
||||
if num_masked_span * mask_length > sequence_length:
|
||||
num_masked_span = sequence_length // mask_length
|
||||
|
||||
# make sure num_masked span is also <= input_length - (mask_length - 1)
|
||||
if input_length - (mask_length - 1) < num_masked_span:
|
||||
num_masked_span = max(input_length - (mask_length - 1), 0)
|
||||
|
||||
return num_masked_span
|
||||
|
||||
# compute number of masked spans in batch
|
||||
@@ -151,7 +155,13 @@ def _compute_mask_indices(
|
||||
# pick first sampled index that will serve as a dummy index to pad vector
|
||||
# to ensure same dimension for all batches due to probabilistic rounding
|
||||
# Picking first sample just pads those vectors twice.
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
if len(spec_aug_mask_idx) == 0:
|
||||
# this case can only happen if `input_length` is strictly smaller than
|
||||
# `sequence_length` in which case the last token has to be a padding
|
||||
# token which we can use as a dummy mask id
|
||||
dummy_mask_idx = sequence_length - 1
|
||||
else:
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
|
||||
spec_aug_mask_idx = np.concatenate(
|
||||
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
|
||||
@@ -173,6 +183,10 @@ def _compute_mask_indices(
|
||||
)
|
||||
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
|
||||
|
||||
# ensure that we cannot have indices larger than sequence_length
|
||||
if spec_aug_mask_idxs.max() > sequence_length - 1:
|
||||
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
|
||||
|
||||
# scatter indices to mask
|
||||
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
|
||||
|
||||
|
||||
@@ -179,10 +179,14 @@ def _compute_mask_indices(
|
||||
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
|
||||
num_masked_span = max(num_masked_span, min_masks)
|
||||
|
||||
# make sure num masked indices <= sequence_length
|
||||
# make sure num masked span <= sequence_length
|
||||
if num_masked_span * mask_length > sequence_length:
|
||||
num_masked_span = sequence_length // mask_length
|
||||
|
||||
# make sure num_masked span is also <= input_length - (mask_length - 1)
|
||||
if input_length - (mask_length - 1) < num_masked_span:
|
||||
num_masked_span = max(input_length - (mask_length - 1), 0)
|
||||
|
||||
return num_masked_span
|
||||
|
||||
# compute number of masked spans in batch
|
||||
@@ -213,7 +217,13 @@ def _compute_mask_indices(
|
||||
# pick first sampled index that will serve as a dummy index to pad vector
|
||||
# to ensure same dimension for all batches due to probabilistic rounding
|
||||
# Picking first sample just pads those vectors twice.
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
if len(spec_aug_mask_idx) == 0:
|
||||
# this case can only happen if `input_length` is strictly smaller than
|
||||
# `sequence_length` in which case the last token has to be a padding
|
||||
# token which we can use as a dummy mask id
|
||||
dummy_mask_idx = sequence_length - 1
|
||||
else:
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
|
||||
spec_aug_mask_idx = np.concatenate(
|
||||
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
|
||||
@@ -235,6 +245,10 @@ def _compute_mask_indices(
|
||||
)
|
||||
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
|
||||
|
||||
# ensure that we cannot have indices larger than sequence_length
|
||||
if spec_aug_mask_idxs.max() > sequence_length - 1:
|
||||
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
|
||||
|
||||
# scatter indices to mask
|
||||
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
|
||||
|
||||
|
||||
@@ -218,10 +218,14 @@ def _compute_mask_indices(
|
||||
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
|
||||
num_masked_span = max(num_masked_span, min_masks)
|
||||
|
||||
# make sure num masked indices <= sequence_length
|
||||
# make sure num masked span <= sequence_length
|
||||
if num_masked_span * mask_length > sequence_length:
|
||||
num_masked_span = sequence_length // mask_length
|
||||
|
||||
# make sure num_masked span is also <= input_length - (mask_length - 1)
|
||||
if input_length - (mask_length - 1) < num_masked_span:
|
||||
num_masked_span = max(input_length - (mask_length - 1), 0)
|
||||
|
||||
return num_masked_span
|
||||
|
||||
# compute number of masked spans in batch
|
||||
@@ -252,7 +256,13 @@ def _compute_mask_indices(
|
||||
# pick first sampled index that will serve as a dummy index to pad vector
|
||||
# to ensure same dimension for all batches due to probabilistic rounding
|
||||
# Picking first sample just pads those vectors twice.
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
if len(spec_aug_mask_idx) == 0:
|
||||
# this case can only happen if `input_length` is strictly smaller than
|
||||
# `sequence_length` in which case the last token has to be a padding
|
||||
# token which we can use as a dummy mask id
|
||||
dummy_mask_idx = sequence_length - 1
|
||||
else:
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
|
||||
spec_aug_mask_idx = np.concatenate(
|
||||
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
|
||||
@@ -274,6 +284,10 @@ def _compute_mask_indices(
|
||||
)
|
||||
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
|
||||
|
||||
# ensure that we cannot have indices larger than sequence_length
|
||||
if spec_aug_mask_idxs.max() > sequence_length - 1:
|
||||
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
|
||||
|
||||
# scatter indices to mask
|
||||
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
|
||||
|
||||
|
||||
@@ -233,10 +233,14 @@ def _compute_mask_indices(
|
||||
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
|
||||
num_masked_span = max(num_masked_span, min_masks)
|
||||
|
||||
# make sure num masked indices <= sequence_length
|
||||
# make sure num masked span <= sequence_length
|
||||
if num_masked_span * mask_length > sequence_length:
|
||||
num_masked_span = sequence_length // mask_length
|
||||
|
||||
# make sure num_masked span is also <= input_length - (mask_length - 1)
|
||||
if input_length - (mask_length - 1) < num_masked_span:
|
||||
num_masked_span = max(input_length - (mask_length - 1), 0)
|
||||
|
||||
return num_masked_span
|
||||
|
||||
# compute number of masked spans in batch
|
||||
@@ -267,7 +271,13 @@ def _compute_mask_indices(
|
||||
# pick first sampled index that will serve as a dummy index to pad vector
|
||||
# to ensure same dimension for all batches due to probabilistic rounding
|
||||
# Picking first sample just pads those vectors twice.
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
if len(spec_aug_mask_idx) == 0:
|
||||
# this case can only happen if `input_length` is strictly smaller than
|
||||
# `sequence_length` in which case the last token has to be a padding
|
||||
# token which we can use as a dummy mask id
|
||||
dummy_mask_idx = sequence_length - 1
|
||||
else:
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
|
||||
spec_aug_mask_idx = np.concatenate(
|
||||
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
|
||||
@@ -289,6 +299,10 @@ def _compute_mask_indices(
|
||||
)
|
||||
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
|
||||
|
||||
# ensure that we cannot have indices larger than sequence_length
|
||||
if spec_aug_mask_idxs.max() > sequence_length - 1:
|
||||
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
|
||||
|
||||
# scatter indices to mask
|
||||
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
|
||||
|
||||
|
||||
@@ -184,10 +184,14 @@ def _compute_mask_indices(
|
||||
num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
|
||||
num_masked_span = max(num_masked_span, min_masks)
|
||||
|
||||
# make sure num masked indices <= sequence_length
|
||||
# make sure num masked span <= sequence_length
|
||||
if num_masked_span * mask_length > sequence_length:
|
||||
num_masked_span = sequence_length // mask_length
|
||||
|
||||
# make sure num_masked span is also <= input_length - (mask_length - 1)
|
||||
if input_length - (mask_length - 1) < num_masked_span:
|
||||
num_masked_span = max(input_length - (mask_length - 1), 0)
|
||||
|
||||
return num_masked_span
|
||||
|
||||
# compute number of masked spans in batch
|
||||
@@ -218,7 +222,13 @@ def _compute_mask_indices(
|
||||
# pick first sampled index that will serve as a dummy index to pad vector
|
||||
# to ensure same dimension for all batches due to probabilistic rounding
|
||||
# Picking first sample just pads those vectors twice.
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
if len(spec_aug_mask_idx) == 0:
|
||||
# this case can only happen if `input_length` is strictly smaller than
|
||||
# `sequence_length` in which case the last token has to be a padding
|
||||
# token which we can use as a dummy mask id
|
||||
dummy_mask_idx = sequence_length - 1
|
||||
else:
|
||||
dummy_mask_idx = spec_aug_mask_idx[0]
|
||||
|
||||
spec_aug_mask_idx = np.concatenate(
|
||||
[spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
|
||||
@@ -240,6 +250,10 @@ def _compute_mask_indices(
|
||||
)
|
||||
spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
|
||||
|
||||
# ensure that we cannot have indices larger than sequence_length
|
||||
if spec_aug_mask_idxs.max() > sequence_length - 1:
|
||||
spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
|
||||
|
||||
# scatter indices to mask
|
||||
np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
|
||||
|
||||
|
||||
@@ -990,6 +990,23 @@ class Wav2Vec2UtilsTest(unittest.TestCase):
|
||||
|
||||
self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0)
|
||||
|
||||
def test_compute_mask_indices_short_audio(self):
|
||||
batch_size = 4
|
||||
sequence_length = 100
|
||||
mask_prob = 0.05
|
||||
mask_length = 10
|
||||
|
||||
attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device)
|
||||
# force one example to be heavily padded
|
||||
attention_mask[0, 5:] = 0
|
||||
|
||||
mask = _compute_mask_indices(
|
||||
(batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2
|
||||
)
|
||||
|
||||
# make sure that the non-padded positions of the heavily padded example are never masked
|
||||
self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any())
|
||||
|
||||
def test_compute_perplexity(self):
|
||||
probs = torch.arange(100, device=torch_device).reshape(2, 5, 10) / 100
|
||||
|
||||
|
||||
Reference in New Issue
Block a user