From c9785d956bb686e06be4c1af3a98dc4c1b9e89ee Mon Sep 17 00:00:00 2001 From: Matt Date: Fri, 13 Oct 2023 14:48:38 +0100 Subject: [PATCH] Disable default system prompt for LLaMA (#26765) * Disable default system prompt for LLaMA * Update test to not expect default prompt --- src/transformers/models/llama/tokenization_llama.py | 4 ++-- src/transformers/models/llama/tokenization_llama_fast.py | 4 ++-- tests/models/llama/test_tokenization_llama.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py index dcf1d8660f..9d9b4f6fcb 100644 --- a/src/transformers/models/llama/tokenization_llama.py +++ b/src/transformers/models/llama/tokenization_llama.py @@ -104,7 +104,7 @@ class LlamaTokenizer(PreTrainedTokenizer): clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra spaces. - use_default_system_prompt (`bool`, *optional*, defaults to `True`): + use_default_system_prompt (`bool`, *optional*, defaults to `False`): Whether or not the default system prompt for Llama should be used. spaces_between_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to add spaces between special tokens. @@ -149,7 +149,7 @@ class LlamaTokenizer(PreTrainedTokenizer): add_bos_token=True, add_eos_token=False, clean_up_tokenization_spaces=False, - use_default_system_prompt=True, + use_default_system_prompt=False, spaces_between_special_tokens=False, legacy=None, **kwargs, diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py index 229272e004..abcbe6bb4b 100644 --- a/src/transformers/models/llama/tokenization_llama_fast.py +++ b/src/transformers/models/llama/tokenization_llama_fast.py @@ -98,7 +98,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast): Whether or not to add an `bos_token` at the start of sequences. add_eos_token (`bool`, *optional*, defaults to `False`): Whether or not to add an `eos_token` at the end of sequences. - use_default_system_prompt (`bool`, *optional*, defaults to `True`): + use_default_system_prompt (`bool`, *optional*, defaults to `False`): Whether or not the default system prompt for Llama should be used. """ @@ -118,7 +118,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast): eos_token="", add_bos_token=True, add_eos_token=False, - use_default_system_prompt=True, + use_default_system_prompt=False, **kwargs, ): super().__init__( diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py index 008ec83c65..83fbc0b0dc 100644 --- a/tests/models/llama/test_tokenization_llama.py +++ b/tests/models/llama/test_tokenization_llama.py @@ -615,7 +615,7 @@ class LlamaIntegrationTest(unittest.TestCase): expected_tokens = [ [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962], [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962, 20103, 304, 5870, 366, 29889, 29871, 2], - [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889, 29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641, 9109, 29889, 3575, 6089, 881, 451, 3160, 738, 10311, 1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916, 391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793, 29889, 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443, 5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644, 263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338, 451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012, 310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915, 29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016, 29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962] + [1, 29961, 25580, 29962, 15043, 29991, 518, 29914, 25580, 29962] ] # fmt: on for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):