Add dithering to the Speech2TextFeatureExtractor API. (#34638)

* Add dithering to the `Speech2TextFeatureExtractor` API.

- in kaldi : 4a8b7f6732/src/feat/feature-window.cc (L145)
- with dithering without a seed, the features become non-deterministic due
  to small Gaussian noise added to the audio (i.e. 2 runs lead to slightly
  different outputs)

* update the PR

- add dithering also for WhisperFeatureExtractor
- not adding to Wav2Vec2FeatureExtractor (no FBANK computation)

* add unit-tests for dithering, fix docstrings

* ruff

* utils/check_copies.py --fix_and_overwrite

* update code, add seed to unit-test

* adding explanation of dithering
This commit is contained in:
Karel Vesely
2025-02-19 11:50:02 +01:00
committed by GitHub
parent 9f51dc2535
commit 1a81d774b1
5 changed files with 119 additions and 1 deletions

View File

@@ -144,6 +144,40 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
def test_dither(self):
    """Check that dithering perturbs the extracted features only slightly.

    Two feature extractors are built from the same tester configuration and
    differ only in ``dither`` (0.0 vs. 1.0). Both process identical inputs;
    the resulting ``input_features`` must differ, but only by a small amount.
    """
    np.random.seed(42)  # seed the dithering randn()

    # Tests that features with and without little dithering are similar, but not the same
    dict_no_dither = self.feat_extract_tester.prepare_feat_extract_dict()
    dict_no_dither["dither"] = 0.0

    dict_dither = self.feat_extract_tester.prepare_feat_extract_dict()
    dict_dither["dither"] = 1.0

    feature_extractor_no_dither = self.feature_extraction_class(**dict_no_dither)
    feature_extractor_dither = self.feature_extraction_class(**dict_dither)

    # create three inputs of length 800, 1000, and 1200
    speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
    np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

    # compute features
    input_features_no_dither = feature_extractor_no_dither(
        np_speech_inputs, padding=True, return_tensors="np"
    ).input_features
    input_features_dither = feature_extractor_dither(
        np_speech_inputs, padding=True, return_tensors="np"
    ).input_features

    # test there is a difference between features (there's added noise to input signal)
    # The absolute difference is computed once and reused by every check below.
    abs_diff = np.abs(input_features_dither - input_features_no_dither)

    # features are not identical
    # (assertGreater / assertLessEqual report both operands on failure,
    # unlike assertTrue on a pre-evaluated comparison)
    self.assertGreater(abs_diff.mean(), 1e-5)
    # features are not too different
    self.assertLessEqual(abs_diff.mean(), 1e-3)
    self.assertLessEqual(abs_diff.max(), 1e-2)
def test_cepstral_mean_and_variance_normalization(self):
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]

View File

@@ -200,6 +200,40 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
def test_dither(self):
    """Check that dithering perturbs the extracted features only slightly.

    Two feature extractors are built from the same tester configuration and
    differ only in ``dither`` (0.0 vs. 0.00003). Both process identical
    inputs; the resulting ``input_features`` must differ, but only by a
    small amount.
    """
    np.random.seed(42)  # seed the dithering randn()

    # Tests that features with and without little dithering are similar, but not the same
    dict_no_dither = self.feat_extract_tester.prepare_feat_extract_dict()
    dict_no_dither["dither"] = 0.0

    dict_dither = self.feat_extract_tester.prepare_feat_extract_dict()
    dict_dither["dither"] = 0.00003  # approx. 1/32k

    feature_extractor_no_dither = self.feature_extraction_class(**dict_no_dither)
    feature_extractor_dither = self.feature_extraction_class(**dict_dither)

    # create three inputs of length 800, 1000, and 1200
    speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
    np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

    # compute features
    input_features_no_dither = feature_extractor_no_dither(
        np_speech_inputs, padding=True, return_tensors="np"
    ).input_features
    input_features_dither = feature_extractor_dither(
        np_speech_inputs, padding=True, return_tensors="np"
    ).input_features

    # test there is a difference between features (there's added noise to input signal)
    # The absolute difference is computed once and reused by every check below.
    abs_diff = np.abs(input_features_dither - input_features_no_dither)

    # features are not identical
    # (assertGreater / assertLessEqual report both operands on failure,
    # unlike assertTrue on a pre-evaluated comparison)
    self.assertGreater(abs_diff.mean(), 1e-6)
    # features are not too different
    self.assertLessEqual(abs_diff.mean(), 1e-4)
    self.assertLessEqual(abs_diff.max(), 1e-3)
@require_torch
def test_double_precision_pad(self):
import torch