Add WhisperTokenizerFast (#21222)
* Add WhisperTokenizerFast * Fixup * Up * Up * Improve tests * Update src/transformers/models/whisper/tokenization_whisper_fast.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Keep stride in whisper pipeline test * Remove unknown token special case * Reduce vocabulary size in tests * Fix vocab size assertion * Sync copied changes from WhisperTokenizer * Skip pipeline tests * Update assertion * Remove Whisper tokenizer dependency on sentencepiece * Format --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
@@ -79,7 +79,7 @@ class TFWhisperModelTester:
|
||||
seq_length=60,
|
||||
is_training=True,
|
||||
use_labels=False,
|
||||
vocab_size=99,
|
||||
vocab_size=200,
|
||||
hidden_size=16,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
|
||||
@@ -96,7 +96,7 @@ class WhisperModelTester:
|
||||
seq_length=60,
|
||||
is_training=True,
|
||||
use_labels=False,
|
||||
vocab_size=99,
|
||||
vocab_size=200,
|
||||
hidden_size=16,
|
||||
num_hidden_layers=2,
|
||||
num_attention_heads=4,
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import unittest
|
||||
|
||||
from transformers.models.whisper import WhisperTokenizer
|
||||
from transformers.models.whisper import WhisperTokenizer, WhisperTokenizerFast
|
||||
from transformers.testing_utils import slow
|
||||
|
||||
from ...test_tokenization_common import TokenizerTesterMixin
|
||||
@@ -31,7 +31,8 @@ NOTIMESTAMPS = 50363
|
||||
|
||||
class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer_class = WhisperTokenizer
|
||||
test_rust_tokenizer = False
|
||||
rust_tokenizer_class = WhisperTokenizerFast
|
||||
test_rust_tokenizer = True
|
||||
test_sentencepiece = False
|
||||
test_seq2seq = False
|
||||
|
||||
@@ -93,6 +94,17 @@ class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
def test_tokenizer_slow_store_full_signature(self):
|
||||
pass
|
||||
|
||||
def test_tokenizer_fast_store_full_signature(self):
|
||||
pass
|
||||
|
||||
def test_special_tokens_initialization(self):
|
||||
# Whisper relies on specific additional special tokens, so we skip this
|
||||
# general test. In particular, this test loads fast tokenizer from slow
|
||||
# tokenizer, and the conversion uses prefix_tokens, where we reference
|
||||
# additional special tokens by specific indices, hence overriding the
|
||||
# list with fewer tokens leads to an index-out-of-range error
|
||||
pass
|
||||
|
||||
@slow
|
||||
def test_tokenizer_integration(self):
|
||||
# fmt: off
|
||||
|
||||
Reference in New Issue
Block a user