|
|
|
|
@@ -1835,9 +1835,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
|
|
|
|
return WhisperProcessor.from_pretrained("openai/whisper-base")
|
|
|
|
|
|
|
|
|
|
def _load_datasamples(self, num_samples):
|
|
|
|
|
ds = load_dataset(
|
|
|
|
|
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
|
|
|
|
|
)
|
|
|
|
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
|
|
|
|
# automatic decoding with librispeech
|
|
|
|
|
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
|
|
|
|
|
|
|
|
|
|
@@ -2718,9 +2716,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
|
|
|
|
)
|
|
|
|
|
assistant_model.to(torch_device)
|
|
|
|
|
|
|
|
|
|
dataset = load_dataset(
|
|
|
|
|
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
|
|
|
|
|
)
|
|
|
|
|
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
|
|
|
|
sample = dataset[0]["audio"]
|
|
|
|
|
|
|
|
|
|
input_features = processor(sample["array"], return_tensors="pt", sampling_rate=16_000).input_features
|
|
|
|
|
@@ -2769,9 +2765,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
|
|
|
|
)
|
|
|
|
|
assistant_model.to(torch_device)
|
|
|
|
|
|
|
|
|
|
dataset = load_dataset(
|
|
|
|
|
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
|
|
|
|
|
)
|
|
|
|
|
dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
|
|
|
|
sample = dataset[0]["audio"]
|
|
|
|
|
|
|
|
|
|
input_features = processor(sample["array"], return_tensors="pt", sampling_rate=16_000).input_features
|
|
|
|
|
@@ -2812,7 +2806,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
|
|
|
|
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
|
|
|
|
|
model = model.to(torch_device)
|
|
|
|
|
|
|
|
|
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True)
|
|
|
|
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean")
|
|
|
|
|
one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)
|
|
|
|
|
|
|
|
|
|
input_features = processor(
|
|
|
|
|
@@ -2848,9 +2842,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
|
|
|
|
prompt = "Mr. Kilter, Brionno." # let's force Quilter -> Kilter, Brion -> Brionno
|
|
|
|
|
prompt_ids = processor.get_prompt_ids(prompt, return_tensors="pt").to(torch_device)
|
|
|
|
|
|
|
|
|
|
ds = load_dataset(
|
|
|
|
|
"hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:-1]", trust_remote_code=True
|
|
|
|
|
)
|
|
|
|
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:-1]")
|
|
|
|
|
one_audio = np.concatenate([x["array"] for x in ds["audio"]], dtype=np.float32)
|
|
|
|
|
|
|
|
|
|
first_text = ds[0]["text"].lower()
|
|
|
|
|
@@ -2901,7 +2893,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
|
|
|
|
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
|
|
|
|
|
model = model.to(torch_device)
|
|
|
|
|
|
|
|
|
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True)
|
|
|
|
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean")
|
|
|
|
|
one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)
|
|
|
|
|
|
|
|
|
|
input_features = processor(
|
|
|
|
|
@@ -2983,7 +2975,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
|
|
|
|
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
|
|
|
|
|
model = model.to(torch_device)
|
|
|
|
|
|
|
|
|
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True)
|
|
|
|
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean")
|
|
|
|
|
one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)
|
|
|
|
|
|
|
|
|
|
input_features = processor(
|
|
|
|
|
@@ -3025,7 +3017,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
|
|
|
|
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
|
|
|
|
|
model = model.to(torch_device)
|
|
|
|
|
|
|
|
|
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True)
|
|
|
|
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean")
|
|
|
|
|
one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)
|
|
|
|
|
audios = []
|
|
|
|
|
audios.append(one_audio[110000:])
|
|
|
|
|
@@ -3079,7 +3071,7 @@ class WhisperModelIntegrationTests(unittest.TestCase):
|
|
|
|
|
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
|
|
|
|
|
model = model.to(torch_device)
|
|
|
|
|
|
|
|
|
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", trust_remote_code=True)
|
|
|
|
|
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean")
|
|
|
|
|
one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)
|
|
|
|
|
audios = []
|
|
|
|
|
audios.append(one_audio[110000:])
|
|
|
|
|
|