Add WhisperTokenizerFast (#21222)

* Add WhisperTokenizerFast * Fixup * Up * Up * Improve tests * Update src/transformers/models/whisper/tokenization_whisper_fast.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Keep stride in whisper pipelien test * Remove unknown token special case * Reduce vocabulary size in tests * Fix vocab size assertion * Sync copied changes from WhisperTokenizer * Skip pipeline tests * Update assertion * Remove Whisper tokenizer dependency on sentencepiece * Format --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
2023-02-21 06:58:54 +01:00
parent 8b3db33a76
commit deafc24388
12 changed files with 568 additions and 8 deletions
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -123,7 +123,7 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase, metaclass=Pipel
            outputs = speech_recognizer(audio, return_timestamps=True)
            self.assertIsInstance(outputs["chunks"], list)
            nb_chunks = len(outputs["chunks"])
-            self.assertGreaterThan(nb_chunks, 0)
+            self.assertGreater(nb_chunks, 0)
            self.assertEqual(
                outputs,
                {