[Whisper] Refactor whisper (#21252)
* update whisper logit processor
* add generate for whisper
* remove part of the whisper specific code from pipeline
* update logit processes
* major update
* enforce first timestamp
* update generate
* add more tests
* update new decoding strategy
* Apply suggestions from code review
* update docstring
* fixup
* default config will not have multilingual ar
* update expected tokenizer size, see pull on the hub for whisper-tiny
This commit is contained in:
@@ -59,7 +59,7 @@ class WhisperTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertEqual(len(vocab_keys), 50364)
|
||||
|
||||
def test_vocab_size(self):
|
||||
self.assertEqual(self.get_tokenizer().vocab_size, 50257)
|
||||
self.assertEqual(self.get_tokenizer().vocab_size, 50258)
|
||||
|
||||
def test_full_tokenizer(self):
|
||||
tokenizer = WhisperTokenizer.from_pretrained(self.tmpdirname)
|
||||
@@ -265,7 +265,15 @@ class SpeechToTextTokenizerMultilinguialTest(unittest.TestCase):
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
# test decoding with `decode_with_timestamps=True`
|
||||
output = multilingual_tokenizer.decode(INPUT_TOKENS, decode_with_timestamps=True)
|
||||
self.assertEqual(
|
||||
output,
|
||||
"<|startoftranscript|><|en|><|transcribe|><|0.00|> Lennils, pictures are a sort of upguards and atom"
|
||||
" paintings, and Mason's exquisite idles<|7.20|><|7.20|> are as national as a jingo poem. Mr. Birkut"
|
||||
" Foster's landscapes smile at one much in the<|15.16|><|15.16|> same way that Mr. Carker used to flash"
|
||||
" his teeth. And Mr. John Colier gives his<|21.70|><|21.70|><|endoftext|>",
|
||||
)
|
||||
# test a single sequence with timestamps
|
||||
# fmt: off
|
||||
INPUT_TOKENS = [
|
||||
|
||||
Reference in New Issue
Block a user