From eef6b0ba42c062eb8b2180327045c89199ea93f8 Mon Sep 17 00:00:00 2001
From: Michael Kamerath <mkamerath@bamboohr.com>
Date: Tue, 22 Oct 2024 07:56:41 -0600
Subject: [PATCH] Add option for running ffmpeg_microphone_live as a background
 process (#32838)

* Add option for running ffmpeg_microphone_live as a background process

* Code quality checks for audio_utils

* Code clean up for audio_utils

* Fixing logic in ffmpeg_microphone calls in audio_utils

* Allowing any arbitrary arguments to be passed to ffmpeg_microphone_live

* Formatting

* Fixing last problems with adding ffmpeg_additional_args

* Fixing default arguments and formatting issues

* Fixing comments for ffmpeg_additional_args

* Adding two short tests for ffmpeg_microphone_live

* Fixing test bug
---
 src/transformers/pipelines/audio_utils.py     | 24 ++++++++++++++++++-
 ..._pipelines_automatic_speech_recognition.py | 10 +++++++-
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/src/transformers/pipelines/audio_utils.py b/src/transformers/pipelines/audio_utils.py
index 40a0c0811f..4a8a93c968 100644
--- a/src/transformers/pipelines/audio_utils.py
+++ b/src/transformers/pipelines/audio_utils.py
@@ -51,6 +51,7 @@ def ffmpeg_microphone(
     chunk_length_s: float,
     format_for_conversion: str = "f32le",
     ffmpeg_input_device: Optional[str] = None,
+    ffmpeg_additional_args: Optional[list[str]] = None,
 ):
     """
     Helper function to read audio from a microphone using ffmpeg. The default input device will be used unless another
@@ -70,6 +71,11 @@ def ffmpeg_microphone(
             The indentifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
             the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
             for how to specify and list input devices.
+        ffmpeg_additional_args (`list[str]`, *optional*):
+            Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background
+            process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags
+            with multiple arguments, use the following convention (e.g. ["flag", "arg1", "arg2"]).
+
     Returns:
         A generator yielding audio chunks of `chunk_length_s` seconds as `bytes` objects of length
         `int(round(sampling_rate * chunk_length_s)) * size_of_sample`.
@@ -95,6 +101,8 @@ def ffmpeg_microphone(
         format_ = "dshow"
         input_ = ffmpeg_input_device or _get_microphone_name()
 
+    ffmpeg_additional_args = [] if ffmpeg_additional_args is None else ffmpeg_additional_args
+
     ffmpeg_command = [
         "ffmpeg",
         "-f",
@@ -114,6 +122,9 @@ def ffmpeg_microphone(
         "quiet",
         "pipe:1",
     ]
+
+    ffmpeg_command.extend(ffmpeg_additional_args)
+
     chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
     iterator = _ffmpeg_stream(ffmpeg_command, chunk_len)
     for item in iterator:
@@ -127,6 +138,7 @@ def ffmpeg_microphone_live(
     stride_length_s: Optional[Union[Tuple[float, float], float]] = None,
     format_for_conversion: str = "f32le",
     ffmpeg_input_device: Optional[str] = None,
+    ffmpeg_additional_args: Optional[list[str]] = None,
 ):
     """
     Helper function to read audio from a microphone using ffmpeg. This will output `partial` overlapping chunks starting
@@ -153,6 +165,11 @@ def ffmpeg_microphone_live(
             The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
             the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
             for how to specify and list input devices.
+        ffmpeg_additional_args (`list[str]`, *optional*):
+            Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background
+            process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags
+            with multiple arguments, use the following convention (e.g. ["flag", "arg1", "arg2"]).
+
     Return:
         A generator yielding dictionaries of the following form
 
@@ -168,8 +185,13 @@ def ffmpeg_microphone_live(
         chunk_s = chunk_length_s
 
     microphone = ffmpeg_microphone(
-        sampling_rate, chunk_s, format_for_conversion=format_for_conversion, ffmpeg_input_device=ffmpeg_input_device
+        sampling_rate,
+        chunk_s,
+        format_for_conversion=format_for_conversion,
+        ffmpeg_input_device=ffmpeg_input_device,
+        ffmpeg_additional_args=[] if ffmpeg_additional_args is None else ffmpeg_additional_args,
     )
+
     if format_for_conversion == "s16le":
         dtype = np.int16
         size_of_sample = 2
diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index 391005b021..b21e8cd25f 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -33,7 +33,7 @@ from transformers import (
     WhisperForConditionalGeneration,
 )
 from transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline
-from transformers.pipelines.audio_utils import chunk_bytes_iter
+from transformers.pipelines.audio_utils import chunk_bytes_iter, ffmpeg_microphone_live
 from transformers.pipelines.automatic_speech_recognition import _find_timestamp_sequence, chunk_iter
 from transformers.testing_utils import (
     compare_pipeline_output_to_hub_spec,
@@ -1989,3 +1989,11 @@ class AudioUtilsTest(unittest.TestCase):
         )
         with self.assertRaises(StopIteration):
             next(iter_)
+
+    def test_ffmpeg_no_additional_args(self):
+        mic = ffmpeg_microphone_live(16000, 2.0)
+        mic.close()
+
+    def test_ffmpeg_additional_args(self):
+        mic = ffmpeg_microphone_live(16000, 2.0, ffmpeg_additional_args=["-nostdin"])
+        mic.close()