Add WhisperTokenizerFast (#21222)

* Add WhisperTokenizerFast * Fixup * Up * Up * Improve tests * Update src/transformers/models/whisper/tokenization_whisper_fast.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Keep stride in whisper pipelien test * Remove unknown token special case * Reduce vocabulary size in tests * Fix vocab size assertion * Sync copied changes from WhisperTokenizer * Skip pipeline tests * Update assertion * Remove Whisper tokenizer dependency on sentencepiece * Format --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
2023-02-21 06:58:54 +01:00
parent 8b3db33a76
commit deafc24388
12 changed files with 568 additions and 8 deletions
--- a/tests/models/whisper/test_modeling_tf_whisper.py
+++ b/tests/models/whisper/test_modeling_tf_whisper.py
@@ -79,7 +79,7 @@ class TFWhisperModelTester:
        seq_length=60,
        is_training=True,
        use_labels=False,
-        vocab_size=99,
+        vocab_size=200,
        hidden_size=16,
        num_hidden_layers=2,
        num_attention_heads=4,
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -96,7 +96,7 @@ class WhisperModelTester:
        seq_length=60,
        is_training=True,
        use_labels=False,
-        vocab_size=99,
+        vocab_size=200,
        hidden_size=16,
        num_hidden_layers=2,
        num_attention_heads=4,
--- a/tests/models/whisper/test_tokenization_whisper.py
+++ b/tests/models/whisper/test_tokenization_whisper.py
@@ -14,7 +14,7 @@

 import unittest

-from transformers.models.whisper import WhisperTokenizer
+from transformers.models.whisper import WhisperTokenizer, WhisperTokenizerFast
 from transformers.testing_utils import slow

 from ...test_tokenization_common import TokenizerTesterMixin
@@ -31,7 +31,8 @@ NOTIMESTAMPS = 50363

 class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = WhisperTokenizer
-    test_rust_tokenizer = False
+    rust_tokenizer_class = WhisperTokenizerFast
+    test_rust_tokenizer = True
    test_sentencepiece = False
    test_seq2seq = False

@@ -93,6 +94,17 @@ class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
    def test_tokenizer_slow_store_full_signature(self):
        pass

+    def test_tokenizer_fast_store_full_signature(self):
+        pass
+
+    def test_special_tokens_initialization(self):
+        # Whisper relies on specific additional special tokens, so we skip this
+        # general test. In particular, this test loads fast tokenizer from slow
+        # tokenizer, and the conversion uses prefix_tokens, where we reference
+        # additional special tokens by specific indices, hence overriding the
+        # list with less tokens leads to out of index error
+        pass
+
    @slow
    def test_tokenizer_integration(self):
        # fmt: off