diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index b839552438..96881b09ce 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -342,7 +342,7 @@ def main(): if data_args.audio_column_name not in raw_datasets["train"].column_names: raise ValueError( - f"--audio_column_name {data_args.audio_column_name} not found in dataset '{data_args.dataset_name}'. " + f"--audio_column_name '{data_args.audio_column_name}' not found in dataset '{data_args.dataset_name}'. " "Make sure to set `--audio_column_name` to the correct audio column - one of " f"{', '.join(raw_datasets['train'].column_names)}." ) diff --git a/setup.py b/setup.py index 674e7fd0df..11f1ac7e8b 100644 --- a/setup.py +++ b/setup.py @@ -136,7 +136,7 @@ _deps = [ "scikit-learn", "sentencepiece>=0.1.91,!=0.1.92", "sigopt", - "soundfile", + "librosa", "sphinx-copybutton", "sphinx-markdown-tables", "sphinx-rtd-theme==0.4.3", # sphinx-rtd-theme==0.5.0 introduced big changes in the style. @@ -251,10 +251,10 @@ extras["optuna"] = deps_list("optuna") extras["ray"] = deps_list("ray[tune]") extras["sigopt"] = deps_list("sigopt") -extras["integrations"] = extras["optuna"] + extras["ray"]+ extras["sigopt"] +extras["integrations"] = extras["optuna"] + extras["ray"] + extras["sigopt"] extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette") -extras["audio"] = deps_list("soundfile") +extras["audio"] = deps_list("librosa") extras["speech"] = deps_list("torchaudio") + extras["audio"] # `pip install ".[speech]"` is deprecated and `pip install ".[torch-speech]"` should be used instead extras["torch-speech"] = deps_list("torchaudio") + extras["audio"] extras["tf-speech"] = extras["audio"] diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index ef39663714..9168ad5c05 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -54,7 +54,7 @@ deps = { "scikit-learn": "scikit-learn", "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92", "sigopt": "sigopt", - "soundfile": "soundfile", + "librosa": "librosa", "sphinx-copybutton": "sphinx-copybutton", "sphinx-markdown-tables": "sphinx-markdown-tables", "sphinx-rtd-theme": "sphinx-rtd-theme==0.4.3", diff --git a/tests/test_modeling_flax_wav2vec2.py b/tests/test_modeling_flax_wav2vec2.py index 07a4ee73e3..d75891a1b0 100644 --- a/tests/test_modeling_flax_wav2vec2.py +++ b/tests/test_modeling_flax_wav2vec2.py @@ -356,21 +356,13 @@ class FlaxWav2Vec2ModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - import soundfile as sf - - ids = [f"1272-141231-000{i}" for i in range(num_samples)] - - # map files to raw - def map_to_array(batch): - speech, _ = sf.read(batch["file"]) - batch["speech"] = speech - return batch - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] - ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array) - - return ds["speech"][:num_samples] + return [x["array"] for x in speech_samples] def test_inference_ctc_robust_batched(self): model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60-self", from_pt=True) diff --git a/tests/test_modeling_hubert.py b/tests/test_modeling_hubert.py index 486b44f404..5d3d27721e 100644 --- a/tests/test_modeling_hubert.py +++ b/tests/test_modeling_hubert.py @@ -613,21 +613,11 @@ class HubertModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - import soundfile as sf - - ids = [f"1272-141231-000{i}" for i in range(num_samples)] - - # map files to raw - def map_to_array(batch): - speech, _ = sf.read(batch["file"]) - batch["speech"] = speech - return batch - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array) - - return ds["speech"][:num_samples] + return [x["array"] for x in speech_samples] def _load_superb(self, task, num_samples): from datasets import load_dataset diff --git a/tests/test_modeling_sew.py b/tests/test_modeling_sew.py index ab5330f12c..45556585eb 100644 --- a/tests/test_modeling_sew.py +++ b/tests/test_modeling_sew.py @@ -407,21 +407,13 @@ class SEWModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - import soundfile as sf + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] - ids = [f"1272-141231-000{i}" for i in range(num_samples)] - - # map files to raw - def map_to_array(batch): - speech, _ = sf.read(batch["file"]) - batch["speech"] = speech - return batch - - ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") - - ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array) - - return ds["speech"][:num_samples] + return [x["array"] for x in speech_samples] def test_inference_pretrained_batched(self): model = SEWModel.from_pretrained("asapp/sew-tiny-100k").to(torch_device) diff --git a/tests/test_modeling_sew_d.py b/tests/test_modeling_sew_d.py index 9ce960c814..974896dc00 100644 --- a/tests/test_modeling_sew_d.py +++ b/tests/test_modeling_sew_d.py @@ -428,21 +428,13 @@ class SEWDModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - import soundfile as sf + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] - ids = [f"1272-141231-000{i}" for i in range(num_samples)] - - # map files to raw - def map_to_array(batch): - speech, _ = sf.read(batch["file"]) - batch["speech"] = speech - return batch - - ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") - - ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array) - - return ds["speech"][:num_samples] + return [x["array"] for x in speech_samples] def test_inference_pretrained_batched(self): model = SEWDModel.from_pretrained("asapp/sew-d-tiny-100k").to(torch_device) diff --git a/tests/test_modeling_speech_to_text.py b/tests/test_modeling_speech_to_text.py index 59211cc33b..f1c02b0a3c 100644 --- a/tests/test_modeling_speech_to_text.py +++ b/tests/test_modeling_speech_to_text.py @@ -715,18 +715,11 @@ class Speech2TextModelIntegrationTests(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - import soundfile as sf - - # map files to raw - def map_to_array(batch): - speech, _ = sf.read(batch["file"]) - batch["speech"] = speech - return batch - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - ds = ds.sort("id").select(range(num_samples)).map(map_to_array) + # automatic decoding with librispeech + speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"] - return ds["speech"][:num_samples] + return [x["array"] for x in speech_samples] def test_generation_librispeech(self): model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr") diff --git a/tests/test_modeling_tf_hubert.py b/tests/test_modeling_tf_hubert.py index 3091ebed67..422c0aa400 100644 --- a/tests/test_modeling_tf_hubert.py +++ b/tests/test_modeling_tf_hubert.py @@ -479,21 +479,13 @@ class TFHubertModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - import soundfile as sf - - ids = [f"1272-141231-000{i}" for i in range(num_samples)] - - # map files to raw - def map_to_array(batch): - speech, _ = sf.read(batch["file"]) - batch["speech"] = speech - return batch - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] - ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array) - - return ds["speech"][:num_samples] + return [x["array"] for x in speech_samples] def test_inference_ctc_normal(self): model = TFHubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft") diff --git a/tests/test_modeling_tf_wav2vec2.py b/tests/test_modeling_tf_wav2vec2.py index b46ef5ae9f..46f877f063 100644 --- a/tests/test_modeling_tf_wav2vec2.py +++ b/tests/test_modeling_tf_wav2vec2.py @@ -479,21 +479,13 @@ class TFWav2Vec2ModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - import soundfile as sf - - ids = [f"1272-141231-000{i}" for i in range(num_samples)] - - # map files to raw - def map_to_array(batch): - speech, _ = sf.read(batch["file"]) - batch["speech"] = speech - return batch - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] - ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array) - - return ds["speech"][:num_samples] + return [x["array"] for x in speech_samples] def test_inference_ctc_normal(self): model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h") diff --git a/tests/test_modeling_wav2vec2.py b/tests/test_modeling_wav2vec2.py index 0ea1bee3b1..164e661ff6 100644 --- a/tests/test_modeling_wav2vec2.py +++ b/tests/test_modeling_wav2vec2.py @@ -900,21 +900,13 @@ class Wav2Vec2ModelIntegrationTest(unittest.TestCase): def _load_datasamples(self, num_samples): from datasets import load_dataset - import soundfile as sf - - ids = [f"1272-141231-000{i}" for i in range(num_samples)] - - # map files to raw - def map_to_array(batch): - speech, _ = sf.read(batch["file"]) - batch["speech"] = speech - return batch - ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") + # automatic decoding with librispeech + speech_samples = ds.sort("id").filter( + lambda x: x["id"] in [f"1272-141231-000{i}" for i in range(num_samples)] + )[:num_samples]["audio"] - ds = ds.filter(lambda x: x["id"] in ids).sort("id").map(map_to_array) - - return ds["speech"][:num_samples] + return [x["array"] for x in speech_samples] def _load_superb(self, task, num_samples): from datasets import load_dataset