[WIP] add SpeechT5 model (#18922)
* make SpeechT5 model by copying Wav2Vec2 * add paper to docs * whoops added docs in wrong file * remove SpeechT5Tokenizer + put CTC back in the name * remove deprecated class * remove unused docstring * delete SpeechT5FeatureExtractor, use Wav2Vec2FeatureExtractor instead * remove classes we don't need right now * initial stab at speech encoder prenet * add more speech encoder prenet stuff * improve SpeechEncoderPrenet * add encoder (not finished yet) * add relative position bias to self-attention * add encoder CTC layers * fix formatting * add decoder from BART, doesn't work yet * make it work with generate loop * wrap the encoder into a speech encoder class * wrap the decoder in a text decoder class * changed my mind * changed my mind again ;-) * load decoder weights, make it work * add weights for text decoder postnet * add SpeechT5ForCTC model that uses only the encoder * clean up EncoderLayer and DecoderLayer * implement _init_weights in SpeechT5PreTrainedModel * cleanup config + Encoder and Decoder * add head + cross attention masks * improve doc comments * fixup * more cleanup * more fixup * TextDecoderPrenet works now, thanks Kendall * add CTC loss * add placeholders for other pre/postnets * add type annotation * fix freeze_feature_encoder * set padding tokens to 0 in decoder attention mask * encoder attention mask downsampling * remove features_pen calculation * disable the padding tokens thing again * fixup * more fixup * code review fixes * rename encoder/decoder wrapper classes * allow checkpoints to be loaded into SpeechT5Model * put encoder into wrapper for CTC model * clean up conversion script * add encoder for TTS model * add speech decoder prenet * add speech decoder post-net * attempt to reconstruct the generation loop * add speech generation loop * clean up generate_speech * small tweaks * fix forward pass * enable always dropout on speech decoder prenet * sort declaration * rename models * fixup * fix copies * more fixup * make consistency checker happy * add Seq2SeqSpectrogramOutput class * doc comments * quick note about loss and labels * add HiFi-GAN implementation (from Speech2Speech PR) * rename file * add vocoder to TTS model * improve vocoder * working on tokenizer * more better tokenizer * add CTC tokenizer * fix decode and batch_code in CTC tokenizer * fix processor * two processors and feature extractors * use SpeechT5WaveformFeatureExtractor instead of Wav2Vec2 * cleanup * more cleanup * even more fixup * notebooks * fix log-mel spectrograms * support reduction factor * fixup * shift spectrograms to right to create decoder inputs * return correct labels * add labels for stop token prediction * fix doc comments * fixup * remove SpeechT5ForPreTraining * more fixup * update copyright headers * add usage examples * add SpeechT5ProcessorForCTC * fixup * push unofficial checkpoints to hub * initial version of tokenizer unit tests * add slow test * fix failing tests * tests for CTC tokenizer * finish CTC tokenizer tests * processor tests * initial test for feature extractors * tests for spectrogram feature extractor * fixup * more fixup * add decorators * require speech for tests * modeling tests * more tests for ASR model * fix imports * add fake tests for the other models * fixup * remove jupyter notebooks * add missing SpeechT5Model tests * add missing tests for SpeechT5ForCTC * add missing tests for SpeechT5ForTextToSpeech * sort tests by name * fix Hi-Fi GAN tests * fixup * add speech-to-speech model * refactor duplicate speech generation code * add processor for SpeechToSpeech model * add usage example * add tests for speech-to-speech model * fixup * enable gradient checkpointing for SpeechT5FeatureEncoder * code review * push_to_hub now takes repo_id * improve doc comments for HiFi-GAN config * add missing test * add integration tests * make number of layers in speech decoder prenet configurable * rename variable * rename variables * add auto classes for TTS and S2S * REMOVE CTC!!! * S2S processor does not support save/load_pretrained * fixup * these models are now in an auto mapping * fix doc links * rename HiFiGAN to HifiGan, remove separate config file * REMOVE auto classes * there can be only one * fixup * replace assert * reformat * feature extractor can process input and target at same time * update checkpoint names * fix commit hash
This commit is contained in:
committed by
GitHub
parent
fb13a7df95
commit
e4bacf6614
BIN
tests/fixtures/test_sentencepiece_bpe_char.model
vendored
Normal file
BIN
tests/fixtures/test_sentencepiece_bpe_char.model
vendored
Normal file
Binary file not shown.
0
tests/models/speecht5/__init__.py
Normal file
0
tests/models/speecht5/__init__.py
Normal file
421
tests/models/speecht5/test_feature_extraction_speecht5.py
Normal file
421
tests/models/speecht5/test_feature_extraction_speecht5.py
Normal file
@@ -0,0 +1,421 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021-2023 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tests for the SpeechT5 feature extractors."""
|
||||
|
||||
import itertools
|
||||
import random
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
|
||||
from transformers import BatchFeature, is_speech_available
|
||||
from transformers.testing_utils import require_torch, require_torchaudio
|
||||
from transformers.utils.import_utils import is_torch_available
|
||||
|
||||
from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
|
||||
|
||||
|
||||
if is_speech_available():
|
||||
from transformers import SpeechT5FeatureExtractor
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
global_rng = random.Random()
|
||||
|
||||
|
||||
def floats_list(shape, scale=1.0, rng=None, name=None):
|
||||
"""Creates a random float32 tensor"""
|
||||
if rng is None:
|
||||
rng = global_rng
|
||||
|
||||
values = []
|
||||
for batch_idx in range(shape[0]):
|
||||
values.append([])
|
||||
for _ in range(shape[1]):
|
||||
values[-1].append(rng.random() * scale)
|
||||
|
||||
return values
|
||||
|
||||
|
||||
@require_torch
|
||||
class SpeechT5FeatureExtractionTester(unittest.TestCase):
|
||||
def __init__(
|
||||
self,
|
||||
parent,
|
||||
batch_size=7,
|
||||
min_seq_length=400,
|
||||
max_seq_length=2000,
|
||||
feature_size=1,
|
||||
padding_value=0.0,
|
||||
sampling_rate=16000,
|
||||
do_normalize=True,
|
||||
num_mel_bins=80,
|
||||
hop_length=16,
|
||||
win_length=64,
|
||||
win_function="hann_window",
|
||||
frame_signal_scale=1.0,
|
||||
fmin=80,
|
||||
fmax=7600,
|
||||
mel_floor=1e-10,
|
||||
reduction_factor=2,
|
||||
return_attention_mask=True,
|
||||
):
|
||||
self.parent = parent
|
||||
self.batch_size = batch_size
|
||||
self.min_seq_length = min_seq_length
|
||||
self.max_seq_length = max_seq_length
|
||||
self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
|
||||
self.feature_size = feature_size
|
||||
self.padding_value = padding_value
|
||||
self.sampling_rate = sampling_rate
|
||||
self.do_normalize = do_normalize
|
||||
self.num_mel_bins = num_mel_bins
|
||||
self.hop_length = hop_length
|
||||
self.win_length = win_length
|
||||
self.win_function = win_function
|
||||
self.frame_signal_scale = frame_signal_scale
|
||||
self.fmin = fmin
|
||||
self.fmax = fmax
|
||||
self.mel_floor = mel_floor
|
||||
self.reduction_factor = reduction_factor
|
||||
self.return_attention_mask = return_attention_mask
|
||||
|
||||
def prepare_feat_extract_dict(self):
|
||||
return {
|
||||
"feature_size": self.feature_size,
|
||||
"padding_value": self.padding_value,
|
||||
"sampling_rate": self.sampling_rate,
|
||||
"do_normalize": self.do_normalize,
|
||||
"num_mel_bins": self.num_mel_bins,
|
||||
"hop_length": self.hop_length,
|
||||
"win_length": self.win_length,
|
||||
"win_function": self.win_function,
|
||||
"frame_signal_scale": self.frame_signal_scale,
|
||||
"fmin": self.fmin,
|
||||
"fmax": self.fmax,
|
||||
"mel_floor": self.mel_floor,
|
||||
"reduction_factor": self.reduction_factor,
|
||||
"return_attention_mask": self.return_attention_mask,
|
||||
}
|
||||
|
||||
def prepare_inputs_for_common(self, equal_length=False, numpify=False):
|
||||
def _flatten(list_of_lists):
|
||||
return list(itertools.chain(*list_of_lists))
|
||||
|
||||
if equal_length:
|
||||
speech_inputs = floats_list((self.batch_size, self.max_seq_length))
|
||||
else:
|
||||
# make sure that inputs increase in size
|
||||
speech_inputs = [
|
||||
_flatten(floats_list((x, self.feature_size)))
|
||||
for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
|
||||
]
|
||||
|
||||
if numpify:
|
||||
speech_inputs = [np.asarray(x) for x in speech_inputs]
|
||||
|
||||
return speech_inputs
|
||||
|
||||
def prepare_inputs_for_target(self, equal_length=False, numpify=False):
|
||||
if equal_length:
|
||||
speech_inputs = [floats_list((self.max_seq_length, self.num_mel_bins)) for _ in range(self.batch_size)]
|
||||
else:
|
||||
# make sure that inputs increase in size
|
||||
speech_inputs = [
|
||||
floats_list((x, self.num_mel_bins))
|
||||
for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
|
||||
]
|
||||
|
||||
if numpify:
|
||||
speech_inputs = [np.asarray(x) for x in speech_inputs]
|
||||
|
||||
return speech_inputs
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_torchaudio
|
||||
class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
|
||||
|
||||
feature_extraction_class = SpeechT5FeatureExtractor if is_speech_available() else None
|
||||
|
||||
def setUp(self):
|
||||
self.feat_extract_tester = SpeechT5FeatureExtractionTester(self)
|
||||
|
||||
def _check_zero_mean_unit_variance(self, input_vector):
|
||||
self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3))
|
||||
self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3))
|
||||
|
||||
def test_call(self):
|
||||
# Tests that all call wrap to encode_plus and batch_encode_plus
|
||||
feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
|
||||
# create three inputs of length 800, 1000, and 1200
|
||||
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
|
||||
np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
|
||||
|
||||
# Test not batched input
|
||||
encoded_sequences_1 = feat_extract(speech_inputs[0], return_tensors="np").input_values
|
||||
encoded_sequences_2 = feat_extract(np_speech_inputs[0], return_tensors="np").input_values
|
||||
self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
|
||||
|
||||
# Test batched
|
||||
encoded_sequences_1 = feat_extract(speech_inputs, return_tensors="np").input_values
|
||||
encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values
|
||||
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
|
||||
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
|
||||
|
||||
def test_zero_mean_unit_variance_normalization_np(self):
|
||||
feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
|
||||
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
|
||||
|
||||
paddings = ["longest", "max_length", "do_not_pad"]
|
||||
max_lengths = [None, 1600, None]
|
||||
for max_length, padding in zip(max_lengths, paddings):
|
||||
processed = feat_extract(speech_inputs, padding=padding, max_length=max_length, return_tensors="np")
|
||||
input_values = processed.input_values
|
||||
|
||||
self._check_zero_mean_unit_variance(input_values[0][:800])
|
||||
self.assertTrue(input_values[0][800:].sum() < 1e-6)
|
||||
self._check_zero_mean_unit_variance(input_values[1][:1000])
|
||||
self.assertTrue(input_values[0][1000:].sum() < 1e-6)
|
||||
self._check_zero_mean_unit_variance(input_values[2][:1200])
|
||||
|
||||
def test_zero_mean_unit_variance_normalization(self):
|
||||
feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
|
||||
lengths = range(800, 1400, 200)
|
||||
speech_inputs = [floats_list((1, x))[0] for x in lengths]
|
||||
|
||||
paddings = ["longest", "max_length", "do_not_pad"]
|
||||
max_lengths = [None, 1600, None]
|
||||
|
||||
for max_length, padding in zip(max_lengths, paddings):
|
||||
processed = feat_extract(speech_inputs, max_length=max_length, padding=padding)
|
||||
input_values = processed.input_values
|
||||
|
||||
self._check_zero_mean_unit_variance(input_values[0][:800])
|
||||
self._check_zero_mean_unit_variance(input_values[1][:1000])
|
||||
self._check_zero_mean_unit_variance(input_values[2][:1200])
|
||||
|
||||
def test_zero_mean_unit_variance_normalization_trunc_np_max_length(self):
|
||||
feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
|
||||
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
|
||||
processed = feat_extract(
|
||||
speech_inputs, truncation=True, max_length=1000, padding="max_length", return_tensors="np"
|
||||
)
|
||||
input_values = processed.input_values
|
||||
|
||||
self._check_zero_mean_unit_variance(input_values[0, :800])
|
||||
self._check_zero_mean_unit_variance(input_values[1])
|
||||
self._check_zero_mean_unit_variance(input_values[2])
|
||||
|
||||
def test_zero_mean_unit_variance_normalization_trunc_np_longest(self):
|
||||
feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
|
||||
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
|
||||
processed = feat_extract(
|
||||
speech_inputs, truncation=True, max_length=1000, padding="longest", return_tensors="np"
|
||||
)
|
||||
input_values = processed.input_values
|
||||
|
||||
self._check_zero_mean_unit_variance(input_values[0, :800])
|
||||
self._check_zero_mean_unit_variance(input_values[1, :1000])
|
||||
self._check_zero_mean_unit_variance(input_values[2])
|
||||
|
||||
# make sure that if max_length < longest -> then pad to max_length
|
||||
self.assertTrue(input_values.shape == (3, 1000))
|
||||
|
||||
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
|
||||
processed = feat_extract(
|
||||
speech_inputs, truncation=True, max_length=2000, padding="longest", return_tensors="np"
|
||||
)
|
||||
input_values = processed.input_values
|
||||
|
||||
self._check_zero_mean_unit_variance(input_values[0, :800])
|
||||
self._check_zero_mean_unit_variance(input_values[1, :1000])
|
||||
self._check_zero_mean_unit_variance(input_values[2])
|
||||
|
||||
# make sure that if max_length > longest -> then pad to longest
|
||||
self.assertTrue(input_values.shape == (3, 1200))
|
||||
|
||||
def test_double_precision_pad(self):
|
||||
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
|
||||
np_speech_inputs = np.random.rand(100).astype(np.float64)
|
||||
py_speech_inputs = np_speech_inputs.tolist()
|
||||
|
||||
for inputs in [py_speech_inputs, np_speech_inputs]:
|
||||
np_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="np")
|
||||
self.assertTrue(np_processed.input_values.dtype == np.float32)
|
||||
pt_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="pt")
|
||||
self.assertTrue(pt_processed.input_values.dtype == torch.float32)
|
||||
|
||||
def test_call_target(self):
|
||||
# Tests that all call wrap to encode_plus and batch_encode_plus
|
||||
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
|
||||
# create three inputs of length 8000, 14000, and 2000
|
||||
speech_inputs = [floats_list((1, x))[0] for x in range(8000, 14000, 2000)]
|
||||
np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
|
||||
|
||||
# Test feature size
|
||||
input_values = feature_extractor(audio_target=np_speech_inputs, padding=True, return_tensors="np").input_values
|
||||
self.assertTrue(input_values.ndim == 3)
|
||||
self.assertTrue(input_values.shape[-1] == feature_extractor.num_mel_bins)
|
||||
|
||||
# Test not batched input
|
||||
encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_values
|
||||
encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_values
|
||||
self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
|
||||
|
||||
# Test batched
|
||||
encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_values
|
||||
encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_values
|
||||
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
|
||||
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
|
||||
|
||||
def test_batch_feature_target(self):
|
||||
speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
|
||||
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
|
||||
input_name = feat_extract.model_input_names[0]
|
||||
|
||||
processed_features = BatchFeature({input_name: speech_inputs})
|
||||
|
||||
self.assertTrue(all(len(x) == len(y) for x, y in zip(speech_inputs, processed_features[input_name])))
|
||||
|
||||
speech_inputs = self.feat_extract_tester.prepare_inputs_for_target(equal_length=True)
|
||||
processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="np")
|
||||
|
||||
batch_features_input = processed_features[input_name]
|
||||
|
||||
if len(batch_features_input.shape) < 3:
|
||||
batch_features_input = batch_features_input[:, :, None]
|
||||
|
||||
self.assertTrue(
|
||||
batch_features_input.shape
|
||||
== (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.num_mel_bins)
|
||||
)
|
||||
|
||||
@require_torch
|
||||
def test_batch_feature_target_pt(self):
|
||||
speech_inputs = self.feat_extract_tester.prepare_inputs_for_target(equal_length=True)
|
||||
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
|
||||
input_name = feat_extract.model_input_names[0]
|
||||
|
||||
processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="pt")
|
||||
|
||||
batch_features_input = processed_features[input_name]
|
||||
|
||||
if len(batch_features_input.shape) < 3:
|
||||
batch_features_input = batch_features_input[:, :, None]
|
||||
|
||||
self.assertTrue(
|
||||
batch_features_input.shape
|
||||
== (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.num_mel_bins)
|
||||
)
|
||||
|
||||
@require_torch
|
||||
def test_padding_accepts_tensors_target_pt(self):
|
||||
feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
|
||||
speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
|
||||
input_name = feat_extract.model_input_names[0]
|
||||
|
||||
processed_features = BatchFeature({input_name: speech_inputs})
|
||||
|
||||
feat_extract.feature_size = feat_extract.num_mel_bins # hack!
|
||||
|
||||
input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name]
|
||||
input_pt = feat_extract.pad(processed_features, padding="longest", return_tensors="pt")[input_name]
|
||||
|
||||
self.assertTrue(abs(input_np.astype(np.float32).sum() - input_pt.numpy().astype(np.float32).sum()) < 1e-2)
|
||||
|
||||
def test_attention_mask_target(self):
|
||||
feat_dict = self.feat_extract_dict
|
||||
feat_dict["return_attention_mask"] = True
|
||||
feat_extract = self.feature_extraction_class(**feat_dict)
|
||||
speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
|
||||
input_lenghts = [len(x) for x in speech_inputs]
|
||||
input_name = feat_extract.model_input_names[0]
|
||||
|
||||
processed = BatchFeature({input_name: speech_inputs})
|
||||
|
||||
feat_extract.feature_size = feat_extract.num_mel_bins # hack!
|
||||
|
||||
processed = feat_extract.pad(processed, padding="longest", return_tensors="np")
|
||||
self.assertIn("attention_mask", processed)
|
||||
self.assertListEqual(list(processed.attention_mask.shape), list(processed[input_name].shape[:2]))
|
||||
self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lenghts)
|
||||
|
||||
def test_attention_mask_with_truncation_target(self):
|
||||
feat_dict = self.feat_extract_dict
|
||||
feat_dict["return_attention_mask"] = True
|
||||
feat_extract = self.feature_extraction_class(**feat_dict)
|
||||
speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
|
||||
input_lenghts = [len(x) for x in speech_inputs]
|
||||
input_name = feat_extract.model_input_names[0]
|
||||
|
||||
processed = BatchFeature({input_name: speech_inputs})
|
||||
max_length = min(input_lenghts)
|
||||
|
||||
feat_extract.feature_size = feat_extract.num_mel_bins # hack!
|
||||
|
||||
processed_pad = feat_extract.pad(
|
||||
processed, padding="max_length", max_length=max_length, truncation=True, return_tensors="np"
|
||||
)
|
||||
self.assertIn("attention_mask", processed_pad)
|
||||
self.assertListEqual(
|
||||
list(processed_pad.attention_mask.shape), list((processed_pad[input_name].shape[0], max_length))
|
||||
)
|
||||
self.assertListEqual(
|
||||
processed_pad.attention_mask[:, :max_length].sum(-1).tolist(), [max_length for x in speech_inputs]
|
||||
)
|
||||
|
||||
def _load_datasamples(self, num_samples):
|
||||
from datasets import load_dataset
|
||||
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
# automatic decoding with librispeech
|
||||
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
||||
|
||||
return [x["array"] for x in speech_samples]
|
||||
|
||||
def test_integration(self):
|
||||
# fmt: off
|
||||
EXPECTED_INPUT_VALUES = torch.tensor(
|
||||
[2.3804e-03, 2.0752e-03, 1.9836e-03, 2.1057e-03, 1.6174e-03,
|
||||
3.0518e-04, 9.1553e-05, 3.3569e-04, 9.7656e-04, 1.8311e-03,
|
||||
2.0142e-03, 2.1057e-03, 1.7395e-03, 4.5776e-04, -3.9673e-04,
|
||||
4.5776e-04, 1.0071e-03, 9.1553e-05, 4.8828e-04, 1.1597e-03,
|
||||
7.3242e-04, 9.4604e-04, 1.8005e-03, 1.8311e-03, 8.8501e-04,
|
||||
4.2725e-04, 4.8828e-04, 7.3242e-04, 1.0986e-03, 2.1057e-03]
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
input_speech = self._load_datasamples(1)
|
||||
feature_extractor = SpeechT5FeatureExtractor()
|
||||
input_values = feature_extractor(input_speech, return_tensors="pt").input_values
|
||||
self.assertTrue(torch.allclose(input_values[0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))
|
||||
|
||||
def test_integration_target(self):
|
||||
# fmt: off
|
||||
EXPECTED_INPUT_VALUES = torch.tensor(
|
||||
[-2.7713, -2.8896, -3.2619, -3.0843, -2.9919, -3.0084, -3.2796, -3.3169,
|
||||
-3.2397, -3.2053, -2.9151, -2.7921, -2.9403, -2.7411, -3.0654, -2.8314,
|
||||
-3.0026, -2.9797, -3.1314, -2.9939, -2.6748, -2.7725, -2.8563, -2.9462,
|
||||
-3.2623, -3.3044, -3.1318, -3.2672, -3.4030, -3.1988]
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
input_speech = self._load_datasamples(1)
|
||||
feature_extractor = SpeechT5FeatureExtractor()
|
||||
input_values = feature_extractor(audio_target=input_speech, return_tensors="pt").input_values
|
||||
self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))
|
||||
1547
tests/models/speecht5/test_modeling_speecht5.py
Normal file
1547
tests/models/speecht5/test_modeling_speecht5.py
Normal file
File diff suppressed because it is too large
Load Diff
187
tests/models/speecht5/test_processor_speecht5.py
Normal file
187
tests/models/speecht5/test_processor_speecht5.py
Normal file
@@ -0,0 +1,187 @@
|
||||
# Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tests for the SpeechT5 processors."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
from transformers import is_speech_available, is_torch_available
|
||||
from transformers.models.speecht5 import SpeechT5Tokenizer
|
||||
from transformers.testing_utils import get_tests_dir, require_torch, require_torchaudio
|
||||
from transformers.utils import FEATURE_EXTRACTOR_NAME
|
||||
|
||||
|
||||
if is_speech_available() and is_torch_available():
|
||||
from transformers import SpeechT5FeatureExtractor, SpeechT5Processor
|
||||
|
||||
from .test_feature_extraction_speecht5 import floats_list
|
||||
|
||||
|
||||
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece_bpe_char.model")
|
||||
|
||||
|
||||
@require_torch
|
||||
@require_torchaudio
|
||||
class SpeechT5ProcessorTest(unittest.TestCase):
|
||||
def setUp(self):
|
||||
self.tmpdirname = tempfile.mkdtemp()
|
||||
|
||||
tokenizer = SpeechT5Tokenizer(SAMPLE_VOCAB)
|
||||
tokenizer.save_pretrained(self.tmpdirname)
|
||||
|
||||
feature_extractor_map = {
|
||||
"feature_size": 1,
|
||||
"padding_value": 0.0,
|
||||
"sampling_rate": 16000,
|
||||
"do_normalize": False,
|
||||
"num_mel_bins": 80,
|
||||
"hop_length": 16,
|
||||
"win_length": 64,
|
||||
"win_function": "hann_window",
|
||||
"frame_signal_scale": 1.0,
|
||||
"fmin": 80,
|
||||
"fmax": 7600,
|
||||
"mel_floor": 1e-10,
|
||||
"reduction_factor": 2,
|
||||
"return_attention_mask": True,
|
||||
}
|
||||
|
||||
self.feature_extraction_file = os.path.join(self.tmpdirname, FEATURE_EXTRACTOR_NAME)
|
||||
with open(self.feature_extraction_file, "w", encoding="utf-8") as fp:
|
||||
fp.write(json.dumps(feature_extractor_map) + "\n")
|
||||
|
||||
def get_tokenizer(self, **kwargs):
|
||||
return SpeechT5Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def get_feature_extractor(self, **kwargs):
|
||||
return SpeechT5FeatureExtractor.from_pretrained(self.tmpdirname, **kwargs)
|
||||
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.tmpdirname)
|
||||
|
||||
def test_save_load_pretrained_default(self):
|
||||
tokenizer = self.get_tokenizer()
|
||||
feature_extractor = self.get_feature_extractor()
|
||||
|
||||
processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
||||
|
||||
processor.save_pretrained(self.tmpdirname)
|
||||
processor = SpeechT5Processor.from_pretrained(self.tmpdirname)
|
||||
|
||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer.get_vocab())
|
||||
self.assertIsInstance(processor.tokenizer, SpeechT5Tokenizer)
|
||||
|
||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor.to_json_string())
|
||||
self.assertIsInstance(processor.feature_extractor, SpeechT5FeatureExtractor)
|
||||
|
||||
def test_save_load_pretrained_additional_features(self):
|
||||
processor = SpeechT5Processor(tokenizer=self.get_tokenizer(), feature_extractor=self.get_feature_extractor())
|
||||
processor.save_pretrained(self.tmpdirname)
|
||||
|
||||
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
|
||||
feature_extractor_add_kwargs = self.get_feature_extractor(do_normalize=False, padding_value=1.0)
|
||||
|
||||
processor = SpeechT5Processor.from_pretrained(
|
||||
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
|
||||
)
|
||||
|
||||
self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
|
||||
self.assertIsInstance(processor.tokenizer, SpeechT5Tokenizer)
|
||||
|
||||
self.assertEqual(processor.feature_extractor.to_json_string(), feature_extractor_add_kwargs.to_json_string())
|
||||
self.assertIsInstance(processor.feature_extractor, SpeechT5FeatureExtractor)
|
||||
|
||||
def test_feature_extractor(self):
|
||||
feature_extractor = self.get_feature_extractor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
||||
|
||||
raw_speech = floats_list((3, 1000))
|
||||
|
||||
input_feat_extract = feature_extractor(audio=raw_speech, return_tensors="np")
|
||||
input_processor = processor(audio=raw_speech, return_tensors="np")
|
||||
|
||||
for key in input_feat_extract.keys():
|
||||
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||
|
||||
def test_feature_extractor_target(self):
|
||||
feature_extractor = self.get_feature_extractor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
||||
|
||||
raw_speech = floats_list((3, 1000))
|
||||
|
||||
input_feat_extract = feature_extractor(audio_target=raw_speech, return_tensors="np")
|
||||
input_processor = processor(audio_target=raw_speech, return_tensors="np")
|
||||
|
||||
for key in input_feat_extract.keys():
|
||||
self.assertAlmostEqual(input_feat_extract[key].sum(), input_processor[key].sum(), delta=1e-2)
|
||||
|
||||
def test_tokenizer(self):
|
||||
feature_extractor = self.get_feature_extractor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
||||
|
||||
input_str = "This is a test string"
|
||||
|
||||
encoded_processor = processor(text=input_str)
|
||||
encoded_tok = tokenizer(input_str)
|
||||
|
||||
for key in encoded_tok.keys():
|
||||
self.assertListEqual(encoded_tok[key], encoded_processor[key])
|
||||
|
||||
def test_tokenizer_target(self):
|
||||
feature_extractor = self.get_feature_extractor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
||||
|
||||
input_str = "This is a test string"
|
||||
|
||||
encoded_processor = processor(text_target=input_str)
|
||||
encoded_tok = tokenizer(input_str)
|
||||
|
||||
for key in encoded_tok.keys():
|
||||
self.assertListEqual(encoded_tok[key], encoded_processor[key])
|
||||
|
||||
def test_tokenizer_decode(self):
|
||||
feature_extractor = self.get_feature_extractor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
||||
|
||||
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]]
|
||||
|
||||
decoded_processor = processor.batch_decode(predicted_ids)
|
||||
decoded_tok = tokenizer.batch_decode(predicted_ids)
|
||||
|
||||
self.assertListEqual(decoded_tok, decoded_processor)
|
||||
|
||||
def test_model_input_names(self):
|
||||
feature_extractor = self.get_feature_extractor()
|
||||
tokenizer = self.get_tokenizer()
|
||||
|
||||
processor = SpeechT5Processor(tokenizer=tokenizer, feature_extractor=feature_extractor)
|
||||
|
||||
self.assertListEqual(
|
||||
processor.model_input_names,
|
||||
feature_extractor.model_input_names,
|
||||
msg="`processor` and `feature_extractor` model input names do not match",
|
||||
)
|
||||
186
tests/models/speecht5/test_tokenization_speecht5.py
Normal file
186
tests/models/speecht5/test_tokenization_speecht5.py
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user