From 125a2882b4997f8ad37beadb8a025114f0f0e1a0 Mon Sep 17 00:00:00 2001 From: peregilk Date: Mon, 31 Jan 2022 21:22:11 +0100 Subject: [PATCH] Update modeling_wav2vec2.py (#15423) * Update modeling_wav2vec2.py With very tiny sound files (less than 0.1 seconds) the num_masked_span can be too long. The issue is described in issue #15366 and discussed with @patrickvonplaten. * correct errors with mask time indices * remove bogus file * make fix-copies Co-authored-by: Patrick von Platen --- .../models/hubert/modeling_hubert.py | 18 ++++++++++++++++-- src/transformers/models/sew/modeling_sew.py | 18 ++++++++++++++++-- .../models/sew_d/modeling_sew_d.py | 18 ++++++++++++++++-- .../models/unispeech/modeling_unispeech.py | 18 ++++++++++++++++-- .../unispeech_sat/modeling_unispeech_sat.py | 18 ++++++++++++++++-- .../models/wav2vec2/modeling_wav2vec2.py | 18 ++++++++++++++++-- .../models/wavlm/modeling_wavlm.py | 18 ++++++++++++++++-- tests/test_modeling_wav2vec2.py | 17 +++++++++++++++++ 8 files changed, 129 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 5ab155e367..76cd97ffed 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -114,10 +114,14 @@ def _compute_mask_indices( num_masked_span = int(mask_prob * input_length / mask_length + epsilon) num_masked_span = max(num_masked_span, min_masks) - # make sure num masked indices <= sequence_length + # make sure num masked span <= sequence_length if num_masked_span * mask_length > sequence_length: num_masked_span = sequence_length // mask_length + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + return num_masked_span # compute number of masked spans in batch @@ -148,7 +152,13 @@ def _compute_mask_indices( # pick first sampled index that will serve as a dummy index to pad vector # to ensure same dimension for all batches due to probabilistic rounding # Picking first sample just pads those vectors twice. - dummy_mask_idx = spec_aug_mask_idx[0] + if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] spec_aug_mask_idx = np.concatenate( [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] @@ -170,6 +180,10 @@ def _compute_mask_indices( ) spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + # scatter indices to mask np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) diff --git a/src/transformers/models/sew/modeling_sew.py b/src/transformers/models/sew/modeling_sew.py index f0dd74f3cf..ecc30478b0 100644 --- a/src/transformers/models/sew/modeling_sew.py +++ b/src/transformers/models/sew/modeling_sew.py @@ -115,10 +115,14 @@ def _compute_mask_indices( num_masked_span = int(mask_prob * input_length / mask_length + epsilon) num_masked_span = max(num_masked_span, min_masks) - # make sure num masked indices <= sequence_length + # make sure num masked span <= sequence_length if num_masked_span * mask_length > sequence_length: num_masked_span = sequence_length // mask_length + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + return num_masked_span # compute number of masked spans in batch @@ -149,7 +153,13 @@ def _compute_mask_indices( # pick first sampled index that will serve as a dummy index to pad vector # to ensure same dimension for all batches due to probabilistic rounding # Picking first sample just pads those vectors twice. - dummy_mask_idx = spec_aug_mask_idx[0] + if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] spec_aug_mask_idx = np.concatenate( [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] @@ -171,6 +181,10 @@ def _compute_mask_indices( ) spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + # scatter indices to mask np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py index 03b2918dbe..33ca04337f 100644 --- a/src/transformers/models/sew_d/modeling_sew_d.py +++ b/src/transformers/models/sew_d/modeling_sew_d.py @@ -117,10 +117,14 @@ def _compute_mask_indices( num_masked_span = int(mask_prob * input_length / mask_length + epsilon) num_masked_span = max(num_masked_span, min_masks) - # make sure num masked indices <= sequence_length + # make sure num masked span <= sequence_length if num_masked_span * mask_length > sequence_length: num_masked_span = sequence_length // mask_length + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + return num_masked_span # compute number of masked spans in batch @@ -151,7 +155,13 @@ def _compute_mask_indices( # pick first sampled index that will serve as a dummy index to pad vector # to ensure same dimension for all batches due to probabilistic rounding # Picking first sample just pads those vectors twice. - dummy_mask_idx = spec_aug_mask_idx[0] + if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] spec_aug_mask_idx = np.concatenate( [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] @@ -173,6 +183,10 @@ def _compute_mask_indices( ) spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + # scatter indices to mask np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 9b3b53929a..3969fa6ab4 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -179,10 +179,14 @@ def _compute_mask_indices( num_masked_span = int(mask_prob * input_length / mask_length + epsilon) num_masked_span = max(num_masked_span, min_masks) - # make sure num masked indices <= sequence_length + # make sure num masked span <= sequence_length if num_masked_span * mask_length > sequence_length: num_masked_span = sequence_length // mask_length + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + return num_masked_span # compute number of masked spans in batch @@ -213,7 +217,13 @@ def _compute_mask_indices( # pick first sampled index that will serve as a dummy index to pad vector # to ensure same dimension for all batches due to probabilistic rounding # Picking first sample just pads those vectors twice. - dummy_mask_idx = spec_aug_mask_idx[0] + if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] spec_aug_mask_idx = np.concatenate( [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] @@ -235,6 +245,10 @@ def _compute_mask_indices( ) spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + # scatter indices to mask np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py index 37999b001e..4a42f9e60f 100755 --- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py +++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py @@ -218,10 +218,14 @@ def _compute_mask_indices( num_masked_span = int(mask_prob * input_length / mask_length + epsilon) num_masked_span = max(num_masked_span, min_masks) - # make sure num masked indices <= sequence_length + # make sure num masked span <= sequence_length if num_masked_span * mask_length > sequence_length: num_masked_span = sequence_length // mask_length + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + return num_masked_span # compute number of masked spans in batch @@ -252,7 +256,13 @@ def _compute_mask_indices( # pick first sampled index that will serve as a dummy index to pad vector # to ensure same dimension for all batches due to probabilistic rounding # Picking first sample just pads those vectors twice. - dummy_mask_idx = spec_aug_mask_idx[0] + if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] spec_aug_mask_idx = np.concatenate( [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] @@ -274,6 +284,10 @@ def _compute_mask_indices( ) spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + # scatter indices to mask np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py index 5eb042b6dd..f80a8bd18c 100755 --- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py +++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py @@ -233,10 +233,14 @@ def _compute_mask_indices( num_masked_span = int(mask_prob * input_length / mask_length + epsilon) num_masked_span = max(num_masked_span, min_masks) - # make sure num masked indices <= sequence_length + # make sure num masked span <= sequence_length if num_masked_span * mask_length > sequence_length: num_masked_span = sequence_length // mask_length + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + return num_masked_span # compute number of masked spans in batch @@ -267,7 +271,13 @@ def _compute_mask_indices( # pick first sampled index that will serve as a dummy index to pad vector # to ensure same dimension for all batches due to probabilistic rounding # Picking first sample just pads those vectors twice. - dummy_mask_idx = spec_aug_mask_idx[0] + if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] spec_aug_mask_idx = np.concatenate( [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] @@ -289,6 +299,10 @@ def _compute_mask_indices( ) spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + # scatter indices to mask np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) diff --git a/src/transformers/models/wavlm/modeling_wavlm.py b/src/transformers/models/wavlm/modeling_wavlm.py index 8062e40885..9f67e61350 100755 --- a/src/transformers/models/wavlm/modeling_wavlm.py +++ b/src/transformers/models/wavlm/modeling_wavlm.py @@ -184,10 +184,14 @@ def _compute_mask_indices( num_masked_span = int(mask_prob * input_length / mask_length + epsilon) num_masked_span = max(num_masked_span, min_masks) - # make sure num masked indices <= sequence_length + # make sure num masked span <= sequence_length if num_masked_span * mask_length > sequence_length: num_masked_span = sequence_length // mask_length + # make sure num_masked span is also <= input_length - (mask_length - 1) + if input_length - (mask_length - 1) < num_masked_span: + num_masked_span = max(input_length - (mask_length - 1), 0) + return num_masked_span # compute number of masked spans in batch @@ -218,7 +222,13 @@ def _compute_mask_indices( # pick first sampled index that will serve as a dummy index to pad vector # to ensure same dimension for all batches due to probabilistic rounding # Picking first sample just pads those vectors twice. - dummy_mask_idx = spec_aug_mask_idx[0] + if len(spec_aug_mask_idx) == 0: + # this case can only happen if `input_length` is strictly smaller then + # `sequence_length` in which case the last token has to be a padding + # token which we can use as a dummy mask id + dummy_mask_idx = sequence_length - 1 + else: + dummy_mask_idx = spec_aug_mask_idx[0] spec_aug_mask_idx = np.concatenate( [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx] @@ -240,6 +250,10 @@ def _compute_mask_indices( ) spec_aug_mask_idxs = spec_aug_mask_idxs + offsets + # ensure that we cannot have indices larger than sequence_length + if spec_aug_mask_idxs.max() > sequence_length - 1: + spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1 + # scatter indices to mask np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1) diff --git a/tests/test_modeling_wav2vec2.py b/tests/test_modeling_wav2vec2.py index 51a371b225..9c07df5fed 100644 --- a/tests/test_modeling_wav2vec2.py +++ b/tests/test_modeling_wav2vec2.py @@ -990,6 +990,23 @@ class Wav2Vec2UtilsTest(unittest.TestCase): self.assertTrue(mask[:2, sequence_length // 2 :].sum() == 0) + def test_compute_mask_indices_short_audio(self): + batch_size = 4 + sequence_length = 100 + mask_prob = 0.05 + mask_length = 10 + + attention_mask = torch.ones((batch_size, sequence_length), dtype=torch.long, device=torch_device) + # force one example to be heavily padded + attention_mask[0, 5:] = 0 + + mask = _compute_mask_indices( + (batch_size, sequence_length), mask_prob, mask_length, attention_mask=attention_mask, min_masks=2 + ) + + # make sure that non-padded examples cannot be padded + self.assertFalse(mask[0][attention_mask[0].to(torch.bool).cpu()].any()) + def test_compute_perplexity(self): probs = torch.arange(100, device=torch_device).reshape(2, 5, 10) / 100