From ebc69afc30687006d85d3380b7a5f02fd6a979b9 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 7 Jul 2021 16:06:48 +0200 Subject: [PATCH] Adding support for `pipeline("automatic-speech-recognition")`. (#11525) * Adding support for `pipeline("automatic-speech-recognition")`. - Ugly `"config"` choice for AutoModel. It would be great to have something like `AutoModelFor` that implements the same logic (load the config, check its architectures, and load the first one). * Remove `model_id`; it was not needed in the end. * Rebased! * Remove old code. * Rename `nlp`. --- src/transformers/pipelines/__init__.py | 8 +++ src/transformers/pipelines/base.py | 4 +- ..._pipelines_automatic_speech_recognition.py | 53 ++++++++++++++++--- 3 files changed, 55 insertions(+), 10 deletions(-) diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index ea353caa52..3400f1f25e 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -108,6 +108,14 @@ TASK_ALIASES = { "ner": "token-classification", } SUPPORTED_TASKS = { + "automatic-speech-recognition": { + "impl": AutomaticSpeechRecognitionPipeline, + "tf": (), + # Only load from `config.architectures`, AutoModelForCTC and AutoModelForConditionalGeneration + # do not exist yet. 
+ "pt": () if is_torch_available() else (), + "default": {"model": {"pt": "facebook/wav2vec2-base-960h"}}, + }, "feature-extraction": { "impl": FeatureExtractionPipeline, "tf": (TFAutoModel,) if is_tf_available() else (), diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index 5065c56ca2..19d9840fc7 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -104,11 +104,11 @@ def infer_framework_load_model( classes = [] for architecture in config.architectures: transformers_module = importlib.import_module("transformers") - if look_tf: + if look_pt: _class = getattr(transformers_module, architecture, None) if _class is not None: classes.append(_class) - if look_pt: + if look_tf: _class = getattr(transformers_module, f"TF{architecture}", None) if _class is not None: classes.append(_class) diff --git a/tests/test_pipelines_automatic_speech_recognition.py b/tests/test_pipelines_automatic_speech_recognition.py index 91dcc71de0..6d391eee9a 100644 --- a/tests/test_pipelines_automatic_speech_recognition.py +++ b/tests/test_pipelines_automatic_speech_recognition.py @@ -15,20 +15,57 @@ import unittest from transformers import AutoFeatureExtractor, AutoTokenizer, Speech2TextForConditionalGeneration, Wav2Vec2ForCTC -from transformers.pipelines import AutomaticSpeechRecognitionPipeline -from transformers.testing_utils import require_datasets, require_torch, require_torchaudio, slow +from transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline +from transformers.testing_utils import is_pipeline_test, require_datasets, require_torch, require_torchaudio, slow +# We can't use this mixin because it assumes TF support. 
# from .test_pipelines_common import CustomInputPipelineCommonMixin +@is_pipeline_test class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): - # pipeline_task = "automatic-speech-recognition" - # small_models = ["facebook/s2t-small-mustc-en-fr-st"] # Models tested without the @slow decorator - # large_models = [ - # "facebook/wav2vec2-base-960h", - # "facebook/s2t-small-mustc-en-fr-st", - # ] # Models tested with the @slow decorator + @require_torch + @slow + def test_pt_defaults(self): + pipeline("automatic-speech-recognition", framework="pt") + + @require_torch + def test_torch_small(self): + import numpy as np + + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="facebook/s2t-small-mustc-en-fr-st", + tokenizer="facebook/s2t-small-mustc-en-fr-st", + framework="pt", + ) + waveform = np.zeros((34000,)) + output = speech_recognizer(waveform) + self.assertEqual(output, {"text": "C'est ce que j'ai fait à ce moment-là."}) + + @require_datasets + @require_torch + @slow + def test_torch_large(self): + import numpy as np + + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="facebook/wav2vec2-base-960h", + tokenizer="facebook/wav2vec2-base-960h", + framework="pt", + ) + waveform = np.zeros((34000,)) + output = speech_recognizer(waveform) + self.assertEqual(output, {"text": ""}) + + from datasets import load_dataset + + ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation") + filename = ds[0]["file"] + output = speech_recognizer(filename) + self.assertEqual(output, {"text": "A MAN SAID TO THE UNIVERSE SIR I EXIST"}) @slow @require_torch