diff --git a/docs/source/en/quicktour.mdx b/docs/source/en/quicktour.mdx index c1b23995aa..7c7a39d568 100644 --- a/docs/source/en/quicktour.mdx +++ b/docs/source/en/quicktour.mdx @@ -134,8 +134,7 @@ Audio files are automatically loaded and resampled when calling the `"audio"` co Let's extract the raw waveform arrays of the first 4 samples and pass it as a list to the pipeline: ```py ->>> raw_audio_waveforms = [d["array"] for d in dataset[:4]["audio"]] ->>> speech_recognizer(raw_audio_waveforms) +>>> speech_recognizer(dataset[:4]["audio"]) [{'text': 'I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT'}, {'text': "FONDERING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE"}, {'text': "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS"}, diff --git a/docs/source/es/quicktour.mdx b/docs/source/es/quicktour.mdx index 16a2eacd9e..8410b6cf50 100644 --- a/docs/source/es/quicktour.mdx +++ b/docs/source/es/quicktour.mdx @@ -133,8 +133,7 @@ Los archivos de audio se cargan y remuestrean automáticamente cuando se llama a Extraigamos las matrices de forma de onda cruda de las primeras 4 muestras y pasémosla como una lista al pipeline: ```py ->>> raw_audio_waveforms = [d["array"] for d in dataset[:4]["audio"]] ->>> speech_recognizer(raw_audio_waveforms) +>>> speech_recognizer(dataset[:4]["audio"]) [{'text': 'I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT'}, {'text': "FONDERING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE"}, {'text': "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS"}, diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index 79e4e8b757..e10dc1208e 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -209,7 +209,18 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline): extra = {} if isinstance(inputs, dict): stride = inputs.pop("stride", None) - _inputs = inputs.pop("raw") + # Accepting `"array"` which is the key defined in `datasets` for + # better integration + if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)): + raise ValueError( + "When passing a dictionnary to AutomaticSpeechRecognitionPipeline, the dict needs to contain a " + '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, ' + "containing the sampling_rate associated with that array" + ) + + _inputs = inputs.pop("raw", None) + if _inputs is None: + _inputs = inputs.pop("array", None) in_sampling_rate = inputs.pop("sampling_rate") extra = inputs inputs = _inputs