[Wav2Vec2] Add New Wav2Vec2 Translation (#14392)
* add new wav2vec2 translation * correct * up * add tests * correct end copy * correct more * up * correct unispeech sat * finish * finalize * finish * up
This commit is contained in:
committed by
GitHub
parent
b567510cff
commit
700a748fe6
@@ -82,6 +82,8 @@ class Wav2Vec2ModelTester:
|
||||
mask_time_length=2,
|
||||
vocab_size=32,
|
||||
do_stable_layer_norm=False,
|
||||
num_adapter_layers=1,
|
||||
adapter_stride=2,
|
||||
scope=None,
|
||||
):
|
||||
self.parent = parent
|
||||
@@ -107,6 +109,8 @@ class Wav2Vec2ModelTester:
|
||||
self.initializer_range = initializer_range
|
||||
self.vocab_size = vocab_size
|
||||
self.do_stable_layer_norm = do_stable_layer_norm
|
||||
self.num_adapter_layers = num_adapter_layers
|
||||
self.adapter_stride = adapter_stride
|
||||
self.mask_time_prob = mask_time_prob
|
||||
self.mask_time_length = mask_time_length
|
||||
self.scope = scope
|
||||
@@ -117,6 +121,8 @@ class Wav2Vec2ModelTester:
|
||||
self.output_seq_length = int(math.ceil(output_seq_length))
|
||||
self.encoder_seq_length = self.output_seq_length
|
||||
|
||||
self.adapter_output_seq_length = (self.output_seq_length - 1) // adapter_stride + 1
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
input_values = floats_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
attention_mask = random_attention_mask([self.batch_size, self.seq_length])
|
||||
@@ -148,6 +154,8 @@ class Wav2Vec2ModelTester:
|
||||
hidden_act=self.hidden_act,
|
||||
initializer_range=self.initializer_range,
|
||||
vocab_size=self.vocab_size,
|
||||
num_adapter_layers=self.num_adapter_layers,
|
||||
adapter_stride=self.adapter_stride,
|
||||
)
|
||||
|
||||
def create_and_check_model(self, config, input_values, attention_mask):
|
||||
@@ -159,6 +167,28 @@ class Wav2Vec2ModelTester:
|
||||
result.last_hidden_state.shape, (self.batch_size, self.output_seq_length, self.hidden_size)
|
||||
)
|
||||
|
||||
def create_and_check_model_with_adapter(self, config, input_values, attention_mask):
|
||||
config.add_adapter = True
|
||||
model = Wav2Vec2Model(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
result = model(input_values, attention_mask=attention_mask)
|
||||
self.parent.assertEqual(
|
||||
result.last_hidden_state.shape, (self.batch_size, self.adapter_output_seq_length, self.hidden_size)
|
||||
)
|
||||
|
||||
def create_and_check_model_with_adapter_proj_dim(self, config, input_values, attention_mask):
|
||||
config.add_adapter = True
|
||||
config.output_hidden_size = 8
|
||||
model = Wav2Vec2Model(config=config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
result = model(input_values, attention_mask=attention_mask)
|
||||
self.parent.assertEqual(
|
||||
result.last_hidden_state.shape,
|
||||
(self.batch_size, self.adapter_output_seq_length, config.output_hidden_size),
|
||||
)
|
||||
|
||||
def create_and_check_batch_inference(self, config, input_values, *args):
|
||||
# test does not pass for models making use of `group_norm`
|
||||
# check: https://github.com/pytorch/fairseq/issues/3227
|
||||
@@ -332,6 +362,14 @@ class Wav2Vec2ModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||
|
||||
def test_model_with_adapter(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model_with_adapter(*config_and_inputs)
|
||||
|
||||
def test_model_with_adapter_proj_dim(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs)
|
||||
|
||||
def test_ctc_loss_inference(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.check_ctc_loss(*config_and_inputs)
|
||||
@@ -544,6 +582,14 @@ class Wav2Vec2RobustModelTest(ModelTesterMixin, unittest.TestCase):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model(*config_and_inputs)
|
||||
|
||||
def test_model_with_adapter(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model_with_adapter(*config_and_inputs)
|
||||
|
||||
def test_model_with_adapter_proj_dim(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_model_with_adapter_proj_dim(*config_and_inputs)
|
||||
|
||||
def test_batched_inference(self):
|
||||
config_and_inputs = self.model_tester.prepare_config_and_inputs()
|
||||
self.model_tester.create_and_check_batch_inference(*config_and_inputs)
|
||||
|
||||
@@ -203,3 +203,41 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=Pipel
|
||||
data = f.read()
|
||||
output = asr(data)
|
||||
self.assertEqual(output, {"text": "Un uomo disse all'universo: \"Signore, io esisto."})
|
||||
|
||||
@slow
|
||||
@require_torch
|
||||
@require_torchaudio
|
||||
@require_datasets
|
||||
def test_xls_r_to_en(self):
|
||||
speech_recognizer = pipeline(
|
||||
task="automatic-speech-recognition",
|
||||
model="facebook/wav2vec2-xls-r-1b-21-to-en",
|
||||
feature_extractor="facebook/wav2vec2-xls-r-1b-21-to-en",
|
||||
framework="pt",
|
||||
)
|
||||
|
||||
from datasets import load_dataset
|
||||
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
|
||||
filename = ds[40]["file"]
|
||||
output = speech_recognizer(filename)
|
||||
self.assertEqual(output, {"text": "A man said to the universe: “Sir, I exist."})
|
||||
|
||||
@slow
|
||||
@require_torch
|
||||
@require_torchaudio
|
||||
@require_datasets
|
||||
def test_xls_r_from_en(self):
|
||||
speech_recognizer = pipeline(
|
||||
task="automatic-speech-recognition",
|
||||
model="facebook/wav2vec2-xls-r-1b-en-to-15",
|
||||
feature_extractor="facebook/wav2vec2-xls-r-1b-en-to-15",
|
||||
framework="pt",
|
||||
)
|
||||
|
||||
from datasets import load_dataset
|
||||
|
||||
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id")
|
||||
filename = ds[40]["file"]
|
||||
output = speech_recognizer(filename)
|
||||
self.assertEqual(output, {"text": "Ein Mann sagte zu dem Universum, Sir, ich bin da."})
|
||||
|
||||
Reference in New Issue
Block a user