From dea563c9438cdbd62cf9122c07184ea2d6a7ecf2 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 18 Jan 2022 12:20:10 +0100 Subject: [PATCH] `is_ctc` needs to be updated to `self.type == "ctc". (#15194) * `is_ctc` needs to be updated to `self.type == "ctc". * Adding fast test for this functionality. --- .../pipelines/automatic_speech_recognition.py | 2 +- ...st_pipelines_automatic_speech_recognition.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py index d059e5f407..f3bdb4277e 100644 --- a/src/transformers/pipelines/automatic_speech_recognition.py +++ b/src/transformers/pipelines/automatic_speech_recognition.py @@ -215,7 +215,7 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline): stride_left = int(round(stride_length_s[0] * self.feature_extractor.sampling_rate)) stride_right = int(round(stride_length_s[1] * self.feature_extractor.sampling_rate)) - if not self.is_ctc: + if self.type != "ctc": raise ValueError( "`chunk_length_s` is only valid for CTC models, use other chunking options for other models" ) diff --git a/tests/test_pipelines_automatic_speech_recognition.py b/tests/test_pipelines_automatic_speech_recognition.py index c77a5c56fc..262aea5beb 100644 --- a/tests/test_pipelines_automatic_speech_recognition.py +++ b/tests/test_pipelines_automatic_speech_recognition.py @@ -278,6 +278,23 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=Pipel output = speech_recognizer(filename) self.assertEqual(output, {"text": "a man said to the universe sir i exist"}) + @require_torch + def test_chunking_fast(self): + speech_recognizer = pipeline( + task="automatic-speech-recognition", + model="hf-internal-testing/tiny-random-wav2vec2", + chunk_length_s=10.0, + ) + + ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation").sort("id") + audio = ds[40]["audio"]["array"] + + n_repeats = 2 + audio_tiled = np.tile(audio, n_repeats) + output = speech_recognizer([audio_tiled], batch_size=2) + self.assertEqual(output, [{"text": ANY(str)}]) + self.assertEqual(output[0]["text"][:6], "ZBT ZC") + @require_torch @slow def test_chunking(self):