Add option for running ffmpeg_microphone_live as a background process (#32838)
* Add option for running ffmpeg_microphone_live as a background process * Code quality checks for audio_utils * Code clean up for audio_utils * Fixing logic in ffmpeg_microphone calls in audio_utils * Allowing any arbitrary arguments to be passed to ffmpeg_microphone_live * Formatting * Fixing last problems with adding ffmpeg_additional_args * Fixing default arguments and formatting issues * Fixing comments for ffmpeg_additional_args * Adding two short tests for ffmpeg_microphone_live * Fixing test bug
This commit is contained in:
@@ -51,6 +51,7 @@ def ffmpeg_microphone(
|
|||||||
chunk_length_s: float,
|
chunk_length_s: float,
|
||||||
format_for_conversion: str = "f32le",
|
format_for_conversion: str = "f32le",
|
||||||
ffmpeg_input_device: Optional[str] = None,
|
ffmpeg_input_device: Optional[str] = None,
|
||||||
|
ffmpeg_additional_args: Optional[list[str]] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Helper function to read audio from a microphone using ffmpeg. The default input device will be used unless another
|
Helper function to read audio from a microphone using ffmpeg. The default input device will be used unless another
|
||||||
@@ -70,6 +71,11 @@ def ffmpeg_microphone(
|
|||||||
The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
|
The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
|
||||||
the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
|
the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
|
||||||
for how to specify and list input devices.
|
for how to specify and list input devices.
|
||||||
|
ffmpeg_additional_args (`list[str]`, *optional*):
|
||||||
|
Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background
|
||||||
|
process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags
|
||||||
|
with multiple arguments, use the following convention (e.g. ["flag", "arg1", "arg2"]).
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
A generator yielding audio chunks of `chunk_length_s` seconds as `bytes` objects of length
|
A generator yielding audio chunks of `chunk_length_s` seconds as `bytes` objects of length
|
||||||
`int(round(sampling_rate * chunk_length_s)) * size_of_sample`.
|
`int(round(sampling_rate * chunk_length_s)) * size_of_sample`.
|
||||||
@@ -95,6 +101,8 @@ def ffmpeg_microphone(
|
|||||||
format_ = "dshow"
|
format_ = "dshow"
|
||||||
input_ = ffmpeg_input_device or _get_microphone_name()
|
input_ = ffmpeg_input_device or _get_microphone_name()
|
||||||
|
|
||||||
|
ffmpeg_additional_args = [] if ffmpeg_additional_args is None else ffmpeg_additional_args
|
||||||
|
|
||||||
ffmpeg_command = [
|
ffmpeg_command = [
|
||||||
"ffmpeg",
|
"ffmpeg",
|
||||||
"-f",
|
"-f",
|
||||||
@@ -114,6 +122,9 @@ def ffmpeg_microphone(
|
|||||||
"quiet",
|
"quiet",
|
||||||
"pipe:1",
|
"pipe:1",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
ffmpeg_command.extend(ffmpeg_additional_args)
|
||||||
|
|
||||||
chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
|
chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
|
||||||
iterator = _ffmpeg_stream(ffmpeg_command, chunk_len)
|
iterator = _ffmpeg_stream(ffmpeg_command, chunk_len)
|
||||||
for item in iterator:
|
for item in iterator:
|
||||||
@@ -127,6 +138,7 @@ def ffmpeg_microphone_live(
|
|||||||
stride_length_s: Optional[Union[Tuple[float, float], float]] = None,
|
stride_length_s: Optional[Union[Tuple[float, float], float]] = None,
|
||||||
format_for_conversion: str = "f32le",
|
format_for_conversion: str = "f32le",
|
||||||
ffmpeg_input_device: Optional[str] = None,
|
ffmpeg_input_device: Optional[str] = None,
|
||||||
|
ffmpeg_additional_args: Optional[list[str]] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Helper function to read audio from a microphone using ffmpeg. This will output `partial` overlapping chunks starting
|
Helper function to read audio from a microphone using ffmpeg. This will output `partial` overlapping chunks starting
|
||||||
@@ -153,6 +165,11 @@ def ffmpeg_microphone_live(
|
|||||||
The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
|
The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
|
||||||
the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
|
the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
|
||||||
for how to specify and list input devices.
|
for how to specify and list input devices.
|
||||||
|
ffmpeg_additional_args (`list[str]`, *optional*):
|
||||||
|
Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background
|
||||||
|
process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags
|
||||||
|
with multiple arguments, use the following convention (e.g. ["flag", "arg1", "arg2"]).
|
||||||
|
|
||||||
Return:
|
Return:
|
||||||
A generator yielding dictionaries of the following form
|
A generator yielding dictionaries of the following form
|
||||||
|
|
||||||
@@ -168,8 +185,13 @@ def ffmpeg_microphone_live(
|
|||||||
chunk_s = chunk_length_s
|
chunk_s = chunk_length_s
|
||||||
|
|
||||||
microphone = ffmpeg_microphone(
|
microphone = ffmpeg_microphone(
|
||||||
sampling_rate, chunk_s, format_for_conversion=format_for_conversion, ffmpeg_input_device=ffmpeg_input_device
|
sampling_rate,
|
||||||
|
chunk_s,
|
||||||
|
format_for_conversion=format_for_conversion,
|
||||||
|
ffmpeg_input_device=ffmpeg_input_device,
|
||||||
|
ffmpeg_additional_args=[] if ffmpeg_additional_args is None else ffmpeg_additional_args,
|
||||||
)
|
)
|
||||||
|
|
||||||
if format_for_conversion == "s16le":
|
if format_for_conversion == "s16le":
|
||||||
dtype = np.int16
|
dtype = np.int16
|
||||||
size_of_sample = 2
|
size_of_sample = 2
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ from transformers import (
|
|||||||
WhisperForConditionalGeneration,
|
WhisperForConditionalGeneration,
|
||||||
)
|
)
|
||||||
from transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline
|
from transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline
|
||||||
from transformers.pipelines.audio_utils import chunk_bytes_iter
|
from transformers.pipelines.audio_utils import chunk_bytes_iter, ffmpeg_microphone_live
|
||||||
from transformers.pipelines.automatic_speech_recognition import _find_timestamp_sequence, chunk_iter
|
from transformers.pipelines.automatic_speech_recognition import _find_timestamp_sequence, chunk_iter
|
||||||
from transformers.testing_utils import (
|
from transformers.testing_utils import (
|
||||||
compare_pipeline_output_to_hub_spec,
|
compare_pipeline_output_to_hub_spec,
|
||||||
@@ -1989,3 +1989,11 @@ class AudioUtilsTest(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
with self.assertRaises(StopIteration):
|
with self.assertRaises(StopIteration):
|
||||||
next(iter_)
|
next(iter_)
|
||||||
|
|
||||||
|
def test_ffmpeg_no_additional_args(self):
|
||||||
|
mic = ffmpeg_microphone_live(16000, 2.0)
|
||||||
|
mic.close()
|
||||||
|
|
||||||
|
def test_ffmpeg_additional_args(self):
|
||||||
|
mic = ffmpeg_microphone_live(16000, 2.0, ffmpeg_additional_args=["-nostdin"])
|
||||||
|
mic.close()
|
||||||
|
|||||||
Reference in New Issue
Block a user