VLM: special multimodal Tokenizer (#34461)

* kinda works

* update

* add tests

* update

* use special tokens in processors

* typo

* fix copies

* fix

* fix moshi after rebase

* update

* fix tests

* update

* Update docs/source/en/main_classes/tokenizer.md

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* update docs

* test for load time adding tokens

* fix some more tests which are now fetched better

* one more fix

---------

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
Raushan Turganbay
2024-11-04 16:37:51 +01:00
committed by GitHub
parent ef976a7e18
commit 187439c3fa
35 changed files with 248 additions and 335 deletions

View File

@@ -28,6 +28,7 @@ from transformers import (
BatchEncoding,
BertTokenizer,
BertTokenizerFast,
LlamaTokenizerFast,
PreTrainedTokenizer,
PreTrainedTokenizerFast,
TensorType,
@@ -280,6 +281,54 @@ class TokenizerUtilsTest(unittest.TestCase):
self.assertEqual(decoded_flat, "##")
self.assertEqual(decoded_list, "##")
def test_extra_special_tokens_multimodal(self):
    """Check that multimodal special tokens given via `extra_special_tokens` round-trip.

    Covers three paths: setting `extra_special_tokens` on a loaded tokenizer and
    saving/reloading it, saving/reloading the reloaded tokenizer a second time,
    and passing `extra_special_tokens` directly to `from_pretrained`.
    """
    special_tokens_list = [
        "bos_token",
        "eos_token",
        "unk_token",
        "sep_token",
        "pad_token",
        "cls_token",
        "mask_token",
        "additional_special_tokens",
    ]
    extra_special_tokens = {
        "boi_token": "<image_start>",
        "eoi_token": "<image_end>",
        "image_token": "<image>",
    }

    llama_tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
    # Hand over a copy so the load-time check further down gets an untouched dict.
    llama_tokenizer.extra_special_tokens = dict(extra_special_tokens)
    # The attribute list is expected to still be the default one at this point.
    self.assertListEqual(llama_tokenizer.SPECIAL_TOKENS_ATTRIBUTES, special_tokens_list)

    with tempfile.TemporaryDirectory() as tmpdirname:
        llama_tokenizer.save_pretrained(tmpdirname)

        # load back and check we have extra special tokens set
        loaded_tokenizer = LlamaTokenizerFast.from_pretrained(tmpdirname)
        multimodal_special_tokens_list = special_tokens_list + ["boi_token", "eoi_token", "image_token"]
        self.assertListEqual(loaded_tokenizer.SPECIAL_TOKENS_ATTRIBUTES, multimodal_special_tokens_list)

        # We registered "image_token" above, so the token string must be exposed and
        # the matching `image_token_id` must agree with the vocabulary lookup.
        self.assertEqual(loaded_tokenizer.image_token, "<image>")
        self.assertEqual(loaded_tokenizer.image_token_id, loaded_tokenizer.convert_tokens_to_ids("<image>"))

    # save one more time and make sure the image token can get loaded back
    with tempfile.TemporaryDirectory() as tmpdirname:
        loaded_tokenizer.save_pretrained(tmpdirname)
        loaded_tokenizer_with_extra_tokens = LlamaTokenizerFast.from_pretrained(tmpdirname)
        self.assertEqual(loaded_tokenizer_with_extra_tokens.image_token, "<image>")

    # test that we can also indicate extra tokens during load time
    tokenizer = LlamaTokenizerFast.from_pretrained(
        "huggyllama/llama-7b", extra_special_tokens=dict(extra_special_tokens)
    )
    self.assertEqual(tokenizer.image_token, "<image>")
    # The id must match this tokenizer's own vocabulary lookup ...
    self.assertEqual(tokenizer.image_token_id, tokenizer.convert_tokens_to_ids("<image>"))
    # ... and stay consistent with the tokenizer that went through save/load.
    self.assertEqual(tokenizer.image_token_id, loaded_tokenizer.convert_tokens_to_ids("<image>"))
@require_tokenizers
def test_decoding_skip_special_tokens(self):
for tokenizer_class in [BertTokenizer, BertTokenizerFast]: