[audio utils] fix fft_bin_width computation (#36603)
* fix fft_bin_width computation * update docstring + enforce correct params * update test with correct value * update test * update feature extractors for concerned models * update * make * update docstring * update docstring
This commit is contained in:
@@ -293,7 +293,7 @@ def mel_filter_bank(
|
||||
|
||||
Args:
|
||||
num_frequency_bins (`int`):
|
||||
Number of frequencies used to compute the spectrogram (should be the same as in `stft`).
|
||||
Number of frequency bins (should be the same as `n_fft // 2 + 1` where `n_fft` is the size of the Fourier Transform used to compute the spectrogram).
|
||||
num_mel_filters (`int`):
|
||||
Number of mel filters to generate.
|
||||
min_frequency (`float`):
|
||||
@@ -317,6 +317,12 @@ def mel_filter_bank(
|
||||
if norm is not None and norm != "slaney":
|
||||
raise ValueError('norm must be one of None or "slaney"')
|
||||
|
||||
if num_frequency_bins < 2:
|
||||
raise ValueError(f"Require num_frequency_bins: {num_frequency_bins} >= 2")
|
||||
|
||||
if min_frequency > max_frequency:
|
||||
raise ValueError(f"Require min_frequency: {min_frequency} <= max_frequency: {max_frequency}")
|
||||
|
||||
# center points of the triangular mel filters
|
||||
mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale)
|
||||
mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale)
|
||||
@@ -325,7 +331,7 @@ def mel_filter_bank(
|
||||
|
||||
if triangularize_in_mel_space:
|
||||
# frequencies of FFT bins in Hz, but filters triangularized in mel space
|
||||
fft_bin_width = sampling_rate / (num_frequency_bins * 2)
|
||||
fft_bin_width = sampling_rate / ((num_frequency_bins - 1) * 2)
|
||||
fft_freqs = hertz_to_mel(fft_bin_width * np.arange(num_frequency_bins), mel_scale=mel_scale)
|
||||
filter_freqs = mel_freqs
|
||||
else:
|
||||
|
||||
@@ -91,7 +91,7 @@ class ASTFeatureExtractor(SequenceFeatureExtractor):
|
||||
|
||||
if not is_speech_available():
|
||||
mel_filters = mel_filter_bank(
|
||||
num_frequency_bins=256,
|
||||
num_frequency_bins=257,
|
||||
num_mel_filters=self.num_mel_bins,
|
||||
min_frequency=20,
|
||||
max_frequency=sampling_rate // 2,
|
||||
@@ -101,7 +101,7 @@ class ASTFeatureExtractor(SequenceFeatureExtractor):
|
||||
triangularize_in_mel_space=True,
|
||||
)
|
||||
|
||||
self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
|
||||
self.mel_filters = mel_filters
|
||||
self.window = window_function(400, "hann", periodic=False)
|
||||
|
||||
def _extract_fbank_features(
|
||||
|
||||
@@ -74,7 +74,7 @@ class SeamlessM4TFeatureExtractor(SequenceFeatureExtractor):
|
||||
self.stride = stride
|
||||
|
||||
mel_filters = mel_filter_bank(
|
||||
num_frequency_bins=256,
|
||||
num_frequency_bins=257,
|
||||
num_mel_filters=self.num_mel_bins,
|
||||
min_frequency=20,
|
||||
max_frequency=sampling_rate // 2,
|
||||
@@ -84,7 +84,7 @@ class SeamlessM4TFeatureExtractor(SequenceFeatureExtractor):
|
||||
triangularize_in_mel_space=True,
|
||||
)
|
||||
|
||||
self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
|
||||
self.mel_filters = mel_filters
|
||||
self.window = window_function(400, "povey", periodic=False)
|
||||
|
||||
super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
|
||||
|
||||
@@ -91,7 +91,7 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
|
||||
|
||||
if not is_speech_available():
|
||||
mel_filters = mel_filter_bank(
|
||||
num_frequency_bins=256,
|
||||
num_frequency_bins=257,
|
||||
num_mel_filters=self.num_mel_bins,
|
||||
min_frequency=20,
|
||||
max_frequency=sampling_rate // 2,
|
||||
@@ -101,7 +101,7 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
|
||||
triangularize_in_mel_space=True,
|
||||
)
|
||||
|
||||
self.mel_filters = np.pad(mel_filters, ((0, 1), (0, 0)))
|
||||
self.mel_filters = mel_filters
|
||||
self.window = window_function(400, "povey", periodic=False)
|
||||
|
||||
def _extract_fbank_features(
|
||||
|
||||
Reference in New Issue
Block a user