From 536ea2aca234fb48c5c69769431d643b0d93b233 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Fri, 29 Mar 2024 00:19:32 +0900 Subject: [PATCH] [`LlamaSlowConverter`] Slow to Fast better support (#29797) * fix * fix test * style * nit * rather rely on concert token to id * fix quality * Update src/transformers/convert_slow_tokenizer.py --- src/transformers/convert_slow_tokenizer.py | 12 +++++----- tests/models/llava/test_modeling_llava.py | 27 ++++++++++++++++++++++ 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 9eed8cfb42..1980ba643a 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -1331,9 +1331,9 @@ class LlamaConverter(SpmConverter): def vocab(self, proto): vocab = [ - ("<unk>", 0.0), - ("<s>", 0.0), - ("</s>", 0.0), + (self.original_tokenizer.convert_ids_to_tokens(0), 0.0), + (self.original_tokenizer.convert_ids_to_tokens(1), 0.0), + (self.original_tokenizer.convert_ids_to_tokens(2), 0.0), ] vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]] return vocab @@ -1371,9 +1371,9 @@ class LlamaConverter(SpmConverter): ) tokenizer.add_special_tokens( [ - AddedToken("<unk>", normalized=False, special=True), - AddedToken("<s>", normalized=False, special=True), - AddedToken("</s>", normalized=False, special=True), + AddedToken(self.original_tokenizer.convert_ids_to_tokens(0), normalized=False, special=True), + AddedToken(self.original_tokenizer.convert_ids_to_tokens(1), normalized=False, special=True), + AddedToken(self.original_tokenizer.convert_ids_to_tokens(2), normalized=False, special=True), ] ) else: diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 856044520a..d6bb2b56ac 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -22,6 +22,7 @@ import requests 
from transformers import ( AutoProcessor, + AutoTokenizer, LlavaConfig, LlavaForConditionalGeneration, is_torch_available, @@ -575,3 +576,29 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase): labels=input_ids, ).loss loss.backward() + + def test_tokenizer_integration(self): + slow_tokenizer = AutoTokenizer.from_pretrained("liuhaotian/llava-v1.6-34b", use_fast=False) + slow_tokenizer.add_tokens("<image>", True) + + fast_tokenizer = AutoTokenizer.from_pretrained( + "liuhaotian/llava-v1.6-34b", + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + from_slow=True, + legacy=False, + ) + fast_tokenizer.add_tokens("<image>", True) + + prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" + # If the token is added as special, it's not normalized, and the only diff is the extra space after special tokens. + # https://github.com/huggingface/transformers/pull/28881 is the fix for this. + self.assertEqual( + slow_tokenizer.tokenize(prompt), + ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '<image>', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n'] + ) # fmt: skip + + self.assertEqual( + fast_tokenizer.tokenize(prompt), + ['<|im_start|>', '▁system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', '▁user', '\n', '<image>', '▁', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', '▁assistant', '\n'] + ) # fmt: skip