enable low-precision pipeline (#31625)

* enable low-precision pipeline

* fix parameter for ASR

* reformat

* fix asr bug

* fix bug for zero-shot

* add dtype check

* rm useless comments

* add np.float16 check

* Update src/transformers/pipelines/image_classification.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update src/transformers/pipelines/token_classification.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* fix comments

* fix asr check

* make fixup

* No more need for is_torch_available()

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>
Co-authored-by: Matt <rocketknight1@gmail.com>
This commit is contained in:
jiqing-feng
2024-09-21 07:43:30 +08:00
committed by GitHub
parent 7b2b536a81
commit 49a0bef4c1
6 changed files with 143 additions and 4 deletions

View File

@@ -167,6 +167,48 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase):
):
_ = speech_recognizer(waveform, return_timestamps="char")
@require_torch
def test_small_model_pt_fp16(self):
speech_recognizer = pipeline(
task="automatic-speech-recognition",
model="facebook/s2t-small-mustc-en-fr-st",
tokenizer="facebook/s2t-small-mustc-en-fr-st",
framework="pt",
torch_dtype=torch.float16,
)
waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
output = speech_recognizer(waveform)
self.assertEqual(output, {"text": "(Applaudissements)"})
output = speech_recognizer(waveform, chunk_length_s=10)
self.assertEqual(output, {"text": "(Applaudissements)"})
# Non CTC models cannot use return_timestamps
with self.assertRaisesRegex(
ValueError, "^We cannot return_timestamps yet on non-CTC models apart from Whisper!$"
):
_ = speech_recognizer(waveform, return_timestamps="char")
@require_torch
def test_small_model_pt_bf16(self):
speech_recognizer = pipeline(
task="automatic-speech-recognition",
model="facebook/s2t-small-mustc-en-fr-st",
tokenizer="facebook/s2t-small-mustc-en-fr-st",
framework="pt",
torch_dtype=torch.bfloat16,
)
waveform = np.tile(np.arange(1000, dtype=np.float32), 34)
output = speech_recognizer(waveform)
self.assertEqual(output, {"text": "(Applaudissements)"})
output = speech_recognizer(waveform, chunk_length_s=10)
self.assertEqual(output, {"text": "(Applaudissements)"})
# Non CTC models cannot use return_timestamps
with self.assertRaisesRegex(
ValueError, "^We cannot return_timestamps yet on non-CTC models apart from Whisper!$"
):
_ = speech_recognizer(waveform, return_timestamps="char")
@slow
@require_torch_accelerator
def test_whisper_fp16(self):