From eef6b0ba42c062eb8b2180327045c89199ea93f8 Mon Sep 17 00:00:00 2001 From: Michael Kamerath Date: Tue, 22 Oct 2024 07:56:41 -0600 Subject: [PATCH] Add option for running ffmpeg_microphone_live as a background process (#32838) * Add option for running ffmpeg_microphone_live as a background process * Code quality checks for audio_utils * Code clean up for audio_utils * Fixing logic in ffmpeg_microphone calls in audio_utils * Allowing any arbitrary arguments to be passed to ffmpeg_microphone_live * Formatting * Fixing last problems with adding ffmpeg_additional_args * Fixing default arguments and formatting issues * Fixing comments for ffmpeg_additional_args * Adding two shorts tests for ffmpeg_microphone_live * Fixing test bug --- src/transformers/pipelines/audio_utils.py | 24 ++++++++++++++++++- ..._pipelines_automatic_speech_recognition.py | 10 +++++++- 2 files changed, 32 insertions(+), 2 deletions(-) diff --git a/src/transformers/pipelines/audio_utils.py b/src/transformers/pipelines/audio_utils.py index 40a0c0811f..4a8a93c968 100644 --- a/src/transformers/pipelines/audio_utils.py +++ b/src/transformers/pipelines/audio_utils.py @@ -51,6 +51,7 @@ def ffmpeg_microphone( chunk_length_s: float, format_for_conversion: str = "f32le", ffmpeg_input_device: Optional[str] = None, + ffmpeg_additional_args: Optional[list[str]] = None, ): """ Helper function to read audio from a microphone using ffmpeg. The default input device will be used unless another @@ -70,6 +71,11 @@ def ffmpeg_microphone( The indentifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset, the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices` for how to specify and list input devices. + ffmpeg_additional_args (`list[str]`, *optional*): + Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background + process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. 
If passing in flags + with multiple arguments, use the following convention (e.g. ["flag", "arg1", "arg2"]). + Returns: A generator yielding audio chunks of `chunk_length_s` seconds as `bytes` objects of length `int(round(sampling_rate * chunk_length_s)) * size_of_sample`. @@ -95,6 +101,8 @@ def ffmpeg_microphone( format_ = "dshow" input_ = ffmpeg_input_device or _get_microphone_name() + ffmpeg_additional_args = [] if ffmpeg_additional_args is None else ffmpeg_additional_args + ffmpeg_command = [ "ffmpeg", "-f", @@ -114,6 +122,9 @@ def ffmpeg_microphone( "quiet", "pipe:1", ] + + ffmpeg_command.extend(ffmpeg_additional_args) + chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample iterator = _ffmpeg_stream(ffmpeg_command, chunk_len) for item in iterator: @@ -127,6 +138,7 @@ def ffmpeg_microphone_live( stride_length_s: Optional[Union[Tuple[float, float], float]] = None, format_for_conversion: str = "f32le", ffmpeg_input_device: Optional[str] = None, + ffmpeg_additional_args: Optional[list[str]] = None, ): """ Helper function to read audio from a microphone using ffmpeg. This will output `partial` overlapping chunks starting @@ -153,6 +165,11 @@ def ffmpeg_microphone_live( The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset, the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices` for how to specify and list input devices. + ffmpeg_additional_args (`list[str]`, *optional*): + Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background + process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags + with multiple arguments, use the following convention (e.g. ["flag", "arg1", "arg2"]). 
+ Return: A generator yielding dictionaries of the following form @@ -168,8 +185,13 @@ def ffmpeg_microphone_live( chunk_s = chunk_length_s microphone = ffmpeg_microphone( - sampling_rate, chunk_s, format_for_conversion=format_for_conversion, ffmpeg_input_device=ffmpeg_input_device + sampling_rate, + chunk_s, + format_for_conversion=format_for_conversion, + ffmpeg_input_device=ffmpeg_input_device, + ffmpeg_additional_args=[] if ffmpeg_additional_args is None else ffmpeg_additional_args, ) + if format_for_conversion == "s16le": dtype = np.int16 size_of_sample = 2 diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py index 391005b021..b21e8cd25f 100644 --- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py +++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py @@ -33,7 +33,7 @@ from transformers import ( WhisperForConditionalGeneration, ) from transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline -from transformers.pipelines.audio_utils import chunk_bytes_iter +from transformers.pipelines.audio_utils import chunk_bytes_iter, ffmpeg_microphone_live from transformers.pipelines.automatic_speech_recognition import _find_timestamp_sequence, chunk_iter from transformers.testing_utils import ( compare_pipeline_output_to_hub_spec, @@ -1989,3 +1989,11 @@ class AudioUtilsTest(unittest.TestCase): ) with self.assertRaises(StopIteration): next(iter_) + + def test_ffmpeg_no_additional_args(self): + mic = ffmpeg_microphone_live(16000, 2.0) + mic.close() + + def test_ffmpeg_additional_args(self): + mic = ffmpeg_microphone_live(16000, 2.0, ffmpeg_additional_args=["-nostdin"]) + mic.close()