audio_utils improvements (#21998)
* silly change to allow making a PR * clean up doc comments * simplify hertz_to_mel and mel_to_hertz * fixup * clean up power_to_db * also add amplitude_to_db * move functions * clean up mel_filter_bank * fixup * credit librosa & torchaudio authors * add unit tests * tests for power_to_db and amplitude_to_db * add mel_filter_bank tests * rewrite STFT * add convenience spectrogram function * missing transpose * fewer transposes * add integration test to M-CTC-T * frame length can be either window or FFT length * rewrite stft API * add preemphasis coefficient * move argument * add log option to spectrogram * replace M-CTC-T feature extractor * fix api thing * replace whisper STFT * replace whisper mel filters * replace tvlt's stft * allow alternate window names * replace speecht5 stft * fixup * fix integration tests * fix doc comments * remove manual FFT length calculation * fix docs * go away, deprecation warnings * combine everything into spectrogram function * add deprecated functions back * fixup
This commit is contained in:
committed by
GitHub
parent
431b04d8c4
commit
7f91950901
@@ -160,6 +160,7 @@ class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test
|
||||
# fmt: on
|
||||
|
||||
input_speech = self._load_datasamples(1)
|
||||
feaure_extractor = ASTFeatureExtractor()
|
||||
input_values = feaure_extractor(input_speech, return_tensors="pt").input_values
|
||||
feature_extractor = ASTFeatureExtractor()
|
||||
input_values = feature_extractor(input_speech, return_tensors="pt").input_values
|
||||
self.assertEquals(input_values.shape, (1, 1024, 128))
|
||||
self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))
|
||||
|
||||
@@ -21,7 +21,7 @@ import unittest
|
||||
import numpy as np
|
||||
|
||||
from transformers import is_speech_available
|
||||
from transformers.testing_utils import require_torch, require_torchaudio
|
||||
from transformers.testing_utils import require_torch
|
||||
|
||||
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
|
||||
|
||||
@@ -47,7 +47,6 @@ def floats_list(shape, scale=1.0, rng=None, name=None):
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_torchaudio
|
||||
class MCTCTFeatureExtractionTester(unittest.TestCase):
|
||||
def __init__(
|
||||
self,
|
||||
@@ -102,7 +101,6 @@ class MCTCTFeatureExtractionTester(unittest.TestCase):
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_torchaudio
|
||||
class MCTCTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
|
||||
feature_extraction_class = MCTCTFeatureExtractor if is_speech_available() else None
|
||||
|
||||
@@ -271,3 +269,38 @@ class MCTCTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Te
|
||||
self.assertTrue(np_processed.input_features.dtype == np.float32)
|
||||
pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
|
||||
self.assertTrue(pt_processed.input_features.dtype == torch.float32)
|
||||
|
||||
def _load_datasamples(self, num_samples):
|
||||
from datasets import load_dataset
|
||||
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
# automatic decoding with librispeech
|
||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
||||
|
||||
return [x["array"] for x in speech_samples]
|
||||
|
||||
def test_integration(self):
|
||||
# fmt: off
|
||||
expected = np.array([
|
||||
[
|
||||
1.1280, 1.1319, 1.2744, 1.4369, 1.4328, 1.3671, 1.2889, 1.3046,
|
||||
1.4419, 0.8387, 0.2995, 0.0404, 0.1068, 0.0472, 0.3728, 1.3356,
|
||||
1.4491, 0.4770, 0.3997, 0.2776, 0.3184, -0.1243, -0.1170, -0.0828
|
||||
],
|
||||
[
|
||||
1.0826, 1.0565, 1.2110, 1.3886, 1.3416, 1.2009, 1.1894, 1.2707,
|
||||
1.5153, 0.7005, 0.4916, 0.4017, 0.3743, 0.1935, 0.4228, 1.1084,
|
||||
0.9768, 0.0608, 0.2044, 0.1723, 0.0433, -0.2360, -0.2478, -0.2643
|
||||
],
|
||||
[
|
||||
1.0590, 0.9923, 1.1185, 1.3309, 1.1971, 1.0067, 1.0080, 1.2036,
|
||||
1.5397, 1.0383, 0.7672, 0.7551, 0.4878, 0.8771, 0.7565, 0.8775,
|
||||
0.9042, 0.4595, 0.6157, 0.4954, 0.1857, 0.0307, 0.0199, 0.1033
|
||||
],
|
||||
])
|
||||
# fmt: on
|
||||
|
||||
input_speech = self._load_datasamples(1)
|
||||
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
|
||||
input_features = feature_extractor(input_speech, sampling_rate=16000, return_tensors="pt").input_features
|
||||
self.assertTrue(np.allclose(input_features[0, 100:103], expected, atol=1e-4))
|
||||
|
||||
@@ -247,3 +247,27 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt
|
||||
self.assertTrue(np_processed.input_features.dtype == np.float32)
|
||||
pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
|
||||
self.assertTrue(pt_processed.input_features.dtype == torch.float32)
|
||||
|
||||
def _load_datasamples(self, num_samples):
|
||||
from datasets import load_dataset
|
||||
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
# automatic decoding with librispeech
|
||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
||||
|
||||
return [x["array"] for x in speech_samples]
|
||||
|
||||
def test_integration(self):
|
||||
# fmt: off
|
||||
expected = np.array([
|
||||
-1.5745, -1.7713, -1.7020, -1.6069, -1.2250, -1.1105, -0.9072, -0.8241,
|
||||
-1.2310, -0.8098, -0.3320, -0.4101, -0.7985, -0.4996, -0.8213, -0.9128,
|
||||
-1.0420, -1.1286, -1.0440, -0.7999, -0.8405, -1.2275, -1.5443, -1.4625,
|
||||
])
|
||||
# fmt: on
|
||||
|
||||
input_speech = self._load_datasamples(1)
|
||||
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
|
||||
input_features = feature_extractor(input_speech, return_tensors="pt").input_features
|
||||
self.assertEquals(input_features.shape, (1, 584, 24))
|
||||
self.assertTrue(np.allclose(input_features[0, 0, :30], expected, atol=1e-4))
|
||||
|
||||
@@ -395,7 +395,8 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest
|
||||
input_speech = self._load_datasamples(1)
|
||||
feature_extractor = SpeechT5FeatureExtractor()
|
||||
input_values = feature_extractor(input_speech, return_tensors="pt").input_values
|
||||
self.assertTrue(torch.allclose(input_values[0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))
|
||||
self.assertEquals(input_values.shape, (1, 93680))
|
||||
self.assertTrue(torch.allclose(input_values[0, :30], EXPECTED_INPUT_VALUES, atol=1e-6))
|
||||
|
||||
def test_integration_target(self):
|
||||
# fmt: off
|
||||
@@ -410,4 +411,5 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest
|
||||
input_speech = self._load_datasamples(1)
|
||||
feature_extractor = SpeechT5FeatureExtractor()
|
||||
input_values = feature_extractor(audio_target=input_speech, return_tensors="pt").input_values
|
||||
self.assertEquals(input_values.shape, (1, 366, 80))
|
||||
self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))
|
||||
|
||||
@@ -198,10 +198,10 @@ class TvltFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes
|
||||
|
||||
def test_integration(self):
|
||||
input_speech = self._load_datasamples(1)
|
||||
feaure_extractor = TvltFeatureExtractor()
|
||||
audio_values = feaure_extractor(input_speech, return_tensors="pt").audio_values
|
||||
feature_extractor = TvltFeatureExtractor()
|
||||
audio_values = feature_extractor(input_speech, return_tensors="pt").audio_values
|
||||
|
||||
self.assertTrue(audio_values.shape, [1, 1, 192, 128])
|
||||
self.assertEquals(audio_values.shape, (1, 1, 192, 128))
|
||||
|
||||
expected_slice = torch.tensor([[-0.3032, -0.2708], [-0.4434, -0.4007]])
|
||||
self.assertTrue(torch.allclose(audio_values[0, 0, :2, :2], expected_slice, atol=1e-4))
|
||||
|
||||
@@ -218,8 +218,9 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.
|
||||
# fmt: on
|
||||
|
||||
input_speech = self._load_datasamples(1)
|
||||
feaure_extractor = WhisperFeatureExtractor()
|
||||
input_features = feaure_extractor(input_speech, return_tensors="pt").input_features
|
||||
feature_extractor = WhisperFeatureExtractor()
|
||||
input_features = feature_extractor(input_speech, return_tensors="pt").input_features
|
||||
self.assertEqual(input_features.shape, (1, 80, 3000))
|
||||
self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4))
|
||||
|
||||
def test_zero_mean_unit_variance_normalization_trunc_np_longest(self):
|
||||
|
||||
Reference in New Issue
Block a user