Add WhisperTokenizerFast (#21222)

* Add WhisperTokenizerFast

* Fixup

* Up

* Up

* Improve tests

* Update src/transformers/models/whisper/tokenization_whisper_fast.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Keep stride in whisper pipeline test

* Remove unknown token special case

* Reduce vocabulary size in tests

* Fix vocab size assertion

* Sync copied changes from WhisperTokenizer

* Skip pipeline tests

* Update assertion

* Remove Whisper tokenizer dependency on sentencepiece

* Format

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
Jonatan Kłosko
2023-02-21 06:58:54 +01:00
committed by GitHub
parent 8b3db33a76
commit deafc24388
12 changed files with 568 additions and 8 deletions

View File

@@ -14,7 +14,7 @@
import unittest
from transformers.models.whisper import WhisperTokenizer
from transformers.models.whisper import WhisperTokenizer, WhisperTokenizerFast
from transformers.testing_utils import slow
from ...test_tokenization_common import TokenizerTesterMixin
@@ -31,7 +31,8 @@ NOTIMESTAMPS = 50363
class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = WhisperTokenizer
test_rust_tokenizer = False
rust_tokenizer_class = WhisperTokenizerFast
test_rust_tokenizer = True
test_sentencepiece = False
test_seq2seq = False
@@ -93,6 +94,17 @@ class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
def test_tokenizer_slow_store_full_signature(self):
pass
def test_tokenizer_fast_store_full_signature(self):
pass
def test_special_tokens_initialization(self):
# Whisper relies on specific additional special tokens, so we skip this
# general test. In particular, this test loads the fast tokenizer from the
# slow tokenizer, and the conversion uses prefix_tokens, where we reference
# additional special tokens by specific indices; hence overriding the
# list with fewer tokens leads to an index-out-of-range error.
pass
@slow
def test_tokenizer_integration(self):
# fmt: off