VLM: special multimodal Tokenizer (#34461)
* kinda works * update * add tests * update * use special tokens in processors * typo * fix copies * fix * fix moshi after rebase * update * fix tests * update * Update docs/source/en/main_classes/tokenizer.md Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * update docs * test for load time adding tokens * fix some more tests which are now fetched better * one more fix --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
ef976a7e18
commit
187439c3fa
@@ -28,6 +28,7 @@ from transformers import (
|
||||
BatchEncoding,
|
||||
BertTokenizer,
|
||||
BertTokenizerFast,
|
||||
LlamaTokenizerFast,
|
||||
PreTrainedTokenizer,
|
||||
PreTrainedTokenizerFast,
|
||||
TensorType,
|
||||
@@ -280,6 +281,54 @@ class TokenizerUtilsTest(unittest.TestCase):
|
||||
self.assertEqual(decoded_flat, "##:")
|
||||
self.assertEqual(decoded_list, "##:")
|
||||
|
||||
def test_extra_special_tokens_multimodal(self):
    """Check that model-specific ("extra") special tokens survive save/load round trips.

    Three scenarios are exercised on a Llama fast tokenizer:

    1. ``extra_special_tokens`` is assigned on an already-loaded tokenizer, the
       tokenizer is saved, and the reloaded copy must expose the new names in
       ``SPECIAL_TOKENS_ATTRIBUTES`` together with ``image_token`` / ``image_token_id``.
    2. The reloaded tokenizer is saved and loaded once more and must still
       expose ``image_token``.
    3. ``extra_special_tokens`` is passed directly to ``from_pretrained``.
    """
    special_tokens_list = [
        "bos_token",
        "eos_token",
        "unk_token",
        "sep_token",
        "pad_token",
        "cls_token",
        "mask_token",
        "additional_special_tokens",
    ]
    # Single source of truth for the multimodal tokens used by every scenario below.
    extra_special_tokens = {
        "boi_token": "<image_start>",
        "eoi_token": "<image_end>",
        "image_token": "<image>",
    }

    llama_tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
    # Hand over a copy so the tokenizer can never mutate the mapping reused later.
    llama_tokenizer.extra_special_tokens = dict(extra_special_tokens)
    # Assigning the mapping on a live tokenizer is expected to leave the attribute list untouched.
    self.assertListEqual(llama_tokenizer.SPECIAL_TOKENS_ATTRIBUTES, special_tokens_list)

    with tempfile.TemporaryDirectory() as tmpdirname:
        llama_tokenizer.save_pretrained(tmpdirname)

        # load back and check we have extra special tokens set
        loaded_tokenizer = LlamaTokenizerFast.from_pretrained(tmpdirname)
        multimodal_special_tokens_list = special_tokens_list + list(extra_special_tokens)
        self.assertListEqual(loaded_tokenizer.SPECIAL_TOKENS_ATTRIBUTES, multimodal_special_tokens_list)

        # We registered "image_token" as a string before saving, so after loading both the
        # string attribute and the derived "image_token_id" must be available and consistent.
        self.assertEqual(loaded_tokenizer.image_token, "<image>")
        self.assertEqual(loaded_tokenizer.image_token_id, loaded_tokenizer.convert_tokens_to_ids("<image>"))

    # save one more time and make sure the image token can get loaded back
    with tempfile.TemporaryDirectory() as tmpdirname:
        loaded_tokenizer.save_pretrained(tmpdirname)
        loaded_tokenizer_with_extra_tokens = LlamaTokenizerFast.from_pretrained(tmpdirname)
        self.assertEqual(loaded_tokenizer_with_extra_tokens.image_token, "<image>")

    # test that we can also indicate extra tokens during load time
    tokenizer = LlamaTokenizerFast.from_pretrained(
        "huggyllama/llama-7b", extra_special_tokens=dict(extra_special_tokens)
    )
    self.assertEqual(tokenizer.image_token, "<image>")
    # The id is compared against the tokenizer reloaded from disk above, so this also checks
    # that both registration routes agree on the id assigned to "<image>".
    self.assertEqual(tokenizer.image_token_id, loaded_tokenizer.convert_tokens_to_ids("<image>"))
|
||||
|
||||
@require_tokenizers
|
||||
def test_decoding_skip_special_tokens(self):
|
||||
for tokenizer_class in [BertTokenizer, BertTokenizerFast]:
|
||||
|
||||
Reference in New Issue
Block a user