Adding support for `array` key in raw dictionaries in ASR pipeline. (#16827)
* Adding support for `array` key in raw dictionaries in ASR pipeline. * ES . * Update src/transformers/pipelines/automatic_speech_recognition.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Making it work by not popping `array` first. * Black 22.3 Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -134,8 +134,7 @@ Audio files are automatically loaded and resampled when calling the `"audio"` co
|
|||||||
Let's extract the raw waveform arrays of the first 4 samples and pass it as a list to the pipeline:
|
Let's extract the raw waveform arrays of the first 4 samples and pass it as a list to the pipeline:
|
||||||
|
|
||||||
```py
|
```py
|
||||||
>>> raw_audio_waveforms = [d["array"] for d in dataset[:4]["audio"]]
|
>>> speech_recognizer(dataset[:4]["audio"])
|
||||||
>>> speech_recognizer(raw_audio_waveforms)
|
|
||||||
[{'text': 'I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT'},
|
[{'text': 'I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT'},
|
||||||
{'text': "FONDERING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE"},
|
{'text': "FONDERING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE"},
|
||||||
{'text': "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS"},
|
{'text': "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS"},
|
||||||
|
|||||||
@@ -133,8 +133,7 @@ Los archivos de audio se cargan y remuestrean automáticamente cuando se llama a
|
|||||||
Extraigamos las matrices de forma de onda cruda de las primeras 4 muestras y pasémosla como una lista al pipeline:
|
Extraigamos las matrices de forma de onda cruda de las primeras 4 muestras y pasémosla como una lista al pipeline:
|
||||||
|
|
||||||
```py
|
```py
|
||||||
>>> raw_audio_waveforms = [d["array"] for d in dataset[:4]["audio"]]
|
>>> speech_recognizer(dataset[:4]["audio"])
|
||||||
>>> speech_recognizer(raw_audio_waveforms)
|
|
||||||
[{'text': 'I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT'},
|
[{'text': 'I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT'},
|
||||||
{'text': "FONDERING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE"},
|
{'text': "FONDERING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE"},
|
||||||
{'text': "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS"},
|
{'text': "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS"},
|
||||||
|
|||||||
@@ -209,7 +209,18 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
|
|||||||
extra = {}
|
extra = {}
|
||||||
if isinstance(inputs, dict):
|
if isinstance(inputs, dict):
|
||||||
stride = inputs.pop("stride", None)
|
stride = inputs.pop("stride", None)
|
||||||
_inputs = inputs.pop("raw")
|
# Accepting `"array"` which is the key defined in `datasets` for
|
||||||
|
# better integration
|
||||||
|
if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)):
|
||||||
|
raise ValueError(
|
||||||
|
"When passing a dictionnary to AutomaticSpeechRecognitionPipeline, the dict needs to contain a "
|
||||||
|
'"raw" key containing the numpy array representing the audio and a "sampling_rate" key, '
|
||||||
|
"containing the sampling_rate associated with that array"
|
||||||
|
)
|
||||||
|
|
||||||
|
_inputs = inputs.pop("raw", None)
|
||||||
|
if _inputs is None:
|
||||||
|
_inputs = inputs.pop("array", None)
|
||||||
in_sampling_rate = inputs.pop("sampling_rate")
|
in_sampling_rate = inputs.pop("sampling_rate")
|
||||||
extra = inputs
|
extra = inputs
|
||||||
inputs = _inputs
|
inputs = _inputs
|
||||||
|
|||||||
Reference in New Issue
Block a user