Fix: [SeamlessM4T - S2TT] Bug in batch loading of audio in torch.Tensor format in the SeamlessM4TFeatureExtractor class (#27914)
* fixes: code fixes on is_batched condition to also check for batched audio data in torch.Tensor format instead of only just checking for batched audio data in np.ndarray format * Update src/transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py Co-authored-by: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com> * refactor: code refactoring to remove torch framework dependency * docs: updated docstring to add torch tensor compatibility * test: add test cases to incorporate torch tensor inputs * test: ran make fix-copies for code conformity * test: refactor test to separate the test_call into test_call_numpy and test_call_torch --------- Co-authored-by: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com>
This commit is contained in:
@@ -139,7 +139,7 @@ class SeamlessM4TFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt
|
||||
dict_second = feat_extract_second.to_dict()
|
||||
self.assertEqual(dict_first, dict_second)
|
||||
|
||||
def test_call(self):
|
||||
def test_call_numpy(self):
|
||||
# Tests that all call wrap to encode_plus and batch_encode_plus
|
||||
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
|
||||
# create three inputs of length 800, 1000, and 1200
|
||||
@@ -171,6 +171,41 @@ class SeamlessM4TFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt
|
||||
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
|
||||
self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
|
||||
|
||||
@require_torch
|
||||
def test_call_torch(self):
|
||||
import torch
|
||||
|
||||
# Tests that all call wrap to encode_plus and batch_encode_plus
|
||||
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
|
||||
# create three inputs of length 800, 1000, and 1200
|
||||
speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
|
||||
pt_speech_inputs = [torch.tensor(speech_input) for speech_input in speech_inputs]
|
||||
|
||||
# Test feature size
|
||||
input_features = feature_extractor(pt_speech_inputs, padding=True, return_tensors="pt").input_features
|
||||
self.assertTrue(input_features.ndim == 3)
|
||||
self.assertTrue(input_features.shape[0] == 3)
|
||||
self.assertTrue(input_features.shape[-1] == feature_extractor.feature_size * feature_extractor.stride)
|
||||
|
||||
# Test not batched input
|
||||
encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="pt").input_features
|
||||
encoded_sequences_2 = feature_extractor(pt_speech_inputs[0], return_tensors="pt").input_features
|
||||
self.assertTrue(torch.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
|
||||
|
||||
# Test batched
|
||||
encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="pt").input_features
|
||||
encoded_sequences_2 = feature_extractor(pt_speech_inputs, return_tensors="pt").input_features
|
||||
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
|
||||
self.assertTrue(torch.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
|
||||
|
||||
# Test 2-D numpy arrays are batched.
|
||||
speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
|
||||
pt_speech_inputs = torch.tensor(speech_inputs)
|
||||
encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="pt").input_features
|
||||
encoded_sequences_2 = feature_extractor(pt_speech_inputs, return_tensors="pt").input_features
|
||||
for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
|
||||
self.assertTrue(torch.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
|
||||
|
||||
@require_torch
|
||||
# Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_double_precision_pad
|
||||
def test_double_precision_pad(self):
|
||||
|
||||
Reference in New Issue
Block a user