audio_utils improvements (#21998)

* silly change to allow making a PR

* clean up doc comments

* simplify hertz_to_mel and mel_to_hertz

* fixup

* clean up power_to_db

* also add amplitude_to_db

* move functions

* clean up mel_filter_bank

* fixup

* credit librosa & torchaudio authors

* add unit tests

* tests for power_to_db and amplitude_to_db

* add mel_filter_bank tests

* rewrite STFT

* add convenience spectrogram function

* missing transpose

* fewer transposes

* add integration test to M-CTC-T

* frame length can be either window or FFT length

* rewrite stft API

* add preemphasis coefficient

* move argument

* add log option to spectrogram

* replace M-CTC-T feature extractor

* fix api thing

* replace whisper STFT

* replace whisper mel filters

* replace tvlt's stft

* allow alternate window names

* replace speecht5 stft

* fixup

* fix integration tests

* fix doc comments

* remove manual FFT length calculation

* fix docs

* go away, deprecation warnings

* combine everything into spectrogram function

* add deprecated functions back

* fixup
This commit is contained in:
Matthijs Hollemans
2023-05-09 15:10:17 +02:00
committed by GitHub
parent 431b04d8c4
commit 7f91950901
14 changed files with 1356 additions and 615 deletions

View File

@@ -160,6 +160,7 @@ class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test
# fmt: on
input_speech = self._load_datasamples(1)
feaure_extractor = ASTFeatureExtractor()
input_values = feaure_extractor(input_speech, return_tensors="pt").input_values
feature_extractor = ASTFeatureExtractor()
input_values = feature_extractor(input_speech, return_tensors="pt").input_values
self.assertEquals(input_values.shape, (1, 1024, 128))
self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))

View File

@@ -21,7 +21,7 @@ import unittest
import numpy as np
from transformers import is_speech_available
from transformers.testing_utils import require_torch, require_torchaudio
from transformers.testing_utils import require_torch
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
@@ -47,7 +47,6 @@ def floats_list(shape, scale=1.0, rng=None, name=None):
@require_torch
@require_torchaudio
class MCTCTFeatureExtractionTester(unittest.TestCase):
def __init__(
self,
@@ -102,7 +101,6 @@ class MCTCTFeatureExtractionTester(unittest.TestCase):
@require_torch
@require_torchaudio
class MCTCTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
feature_extraction_class = MCTCTFeatureExtractor if is_speech_available() else None
@@ -271,3 +269,38 @@ class MCTCTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Te
self.assertTrue(np_processed.input_features.dtype == np.float32)
pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
self.assertTrue(pt_processed.input_features.dtype == torch.float32)
def _load_datasamples(self, num_samples):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
return [x["array"] for x in speech_samples]
def test_integration(self):
# fmt: off
expected = np.array([
[
1.1280, 1.1319, 1.2744, 1.4369, 1.4328, 1.3671, 1.2889, 1.3046,
1.4419, 0.8387, 0.2995, 0.0404, 0.1068, 0.0472, 0.3728, 1.3356,
1.4491, 0.4770, 0.3997, 0.2776, 0.3184, -0.1243, -0.1170, -0.0828
],
[
1.0826, 1.0565, 1.2110, 1.3886, 1.3416, 1.2009, 1.1894, 1.2707,
1.5153, 0.7005, 0.4916, 0.4017, 0.3743, 0.1935, 0.4228, 1.1084,
0.9768, 0.0608, 0.2044, 0.1723, 0.0433, -0.2360, -0.2478, -0.2643
],
[
1.0590, 0.9923, 1.1185, 1.3309, 1.1971, 1.0067, 1.0080, 1.2036,
1.5397, 1.0383, 0.7672, 0.7551, 0.4878, 0.8771, 0.7565, 0.8775,
0.9042, 0.4595, 0.6157, 0.4954, 0.1857, 0.0307, 0.0199, 0.1033
],
])
# fmt: on
input_speech = self._load_datasamples(1)
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
input_features = feature_extractor(input_speech, sampling_rate=16000, return_tensors="pt").input_features
self.assertTrue(np.allclose(input_features[0, 100:103], expected, atol=1e-4))

View File

@@ -247,3 +247,27 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt
self.assertTrue(np_processed.input_features.dtype == np.float32)
pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
self.assertTrue(pt_processed.input_features.dtype == torch.float32)
def _load_datasamples(self, num_samples):
from datasets import load_dataset
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
return [x["array"] for x in speech_samples]
def test_integration(self):
# fmt: off
expected = np.array([
-1.5745, -1.7713, -1.7020, -1.6069, -1.2250, -1.1105, -0.9072, -0.8241,
-1.2310, -0.8098, -0.3320, -0.4101, -0.7985, -0.4996, -0.8213, -0.9128,
-1.0420, -1.1286, -1.0440, -0.7999, -0.8405, -1.2275, -1.5443, -1.4625,
])
# fmt: on
input_speech = self._load_datasamples(1)
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
input_features = feature_extractor(input_speech, return_tensors="pt").input_features
self.assertEquals(input_features.shape, (1, 584, 24))
self.assertTrue(np.allclose(input_features[0, 0, :30], expected, atol=1e-4))

View File

@@ -395,7 +395,8 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest
input_speech = self._load_datasamples(1)
feature_extractor = SpeechT5FeatureExtractor()
input_values = feature_extractor(input_speech, return_tensors="pt").input_values
self.assertTrue(torch.allclose(input_values[0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))
self.assertEquals(input_values.shape, (1, 93680))
self.assertTrue(torch.allclose(input_values[0, :30], EXPECTED_INPUT_VALUES, atol=1e-6))
def test_integration_target(self):
# fmt: off
@@ -410,4 +411,5 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest
input_speech = self._load_datasamples(1)
feature_extractor = SpeechT5FeatureExtractor()
input_values = feature_extractor(audio_target=input_speech, return_tensors="pt").input_values
self.assertEquals(input_values.shape, (1, 366, 80))
self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))

View File

@@ -198,10 +198,10 @@ class TvltFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes
def test_integration(self):
input_speech = self._load_datasamples(1)
feaure_extractor = TvltFeatureExtractor()
audio_values = feaure_extractor(input_speech, return_tensors="pt").audio_values
feature_extractor = TvltFeatureExtractor()
audio_values = feature_extractor(input_speech, return_tensors="pt").audio_values
self.assertTrue(audio_values.shape, [1, 1, 192, 128])
self.assertEquals(audio_values.shape, (1, 1, 192, 128))
expected_slice = torch.tensor([[-0.3032, -0.2708], [-0.4434, -0.4007]])
self.assertTrue(torch.allclose(audio_values[0, 0, :2, :2], expected_slice, atol=1e-4))

View File

@@ -218,8 +218,9 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
# fmt: on
input_speech = self._load_datasamples(1)
feaure_extractor = WhisperFeatureExtractor()
input_features = feaure_extractor(input_speech, return_tensors="pt").input_features
feature_extractor = WhisperFeatureExtractor()
input_features = feature_extractor(input_speech, return_tensors="pt").input_features
self.assertEqual(input_features.shape, (1, 80, 3000))
self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4))
def test_zero_mean_unit_variance_normalization_trunc_np_longest(self):