From 9a4a119c10bde6a19e45f0a2cf01e9a860c75450 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 24 Apr 2024 10:51:35 +0200 Subject: [PATCH] [`Llava`] + CIs fix red cis and llava integration tests (#30440) * nit * nit and fmt skip * fixup * Update src/transformers/convert_slow_tokenizer.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * set to true --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- src/transformers/convert_slow_tokenizer.py | 4 ++-- tests/models/llava/test_modeling_llava.py | 14 +++----------- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index 39c239d145..eed746096b 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -46,7 +46,7 @@ def import_protobuf(error_message=""): def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str: if add_prefix_space: prepend_scheme = "always" - if hasattr(original_tokenizer, "legacy") and not original_tokenizer.legacy: + if not getattr(original_tokenizer, "legacy", True): prepend_scheme = "first" else: prepend_scheme = "never" @@ -1393,7 +1393,7 @@ class LlamaConverter(SpmConverter): return tokenizer def normalizer(self, proto): - if self.original_tokenizer.legacy: + if getattr(self.original_tokenizer, "legacy", True): sequence = [] if getattr(self.original_tokenizer, "add_prefix_space"): sequence += [normalizers.Prepend(prepend="▁")] diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index ce432e0599..b4c57e7ba0 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -591,14 +591,6 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase): fast_tokenizer.add_tokens("", True) prompt = "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n" - # If the token is added as special, it's not normalized, and the only diff is the extra space after special tokens. - # https://github.com/huggingface/transformers/pull/28881 is the fix for this. - self.assertEqual( - slow_tokenizer.tokenize(prompt), - ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n'] - ) # fmt: skip - - self.assertEqual( - fast_tokenizer.tokenize(prompt), - ['<|im_start|>', '▁system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', '▁user', '\n', '', '▁', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', '▁assistant', '\n'] - ) # fmt: skip + EXPECTED_OUTPUT = ['<|im_start|>', 'system', '\n', 'Answer', '▁the', '▁questions', '.', '<|im_end|>', '<|im_start|>', 'user', '\n', '', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<|im_end|>', '<|im_start|>', 'ass', 'istant', '\n'] # fmt: skip + self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) + self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT)