Disable default system prompt for LLaMA (#26765)

* Disable default system prompt for LLaMA

* Update test to not expect default prompt
This commit is contained in:
Matt
2023-10-13 14:48:38 +01:00
committed by GitHub
parent 6df9179c1c
commit c9785d956b
3 changed files with 5 additions and 5 deletions

View File

@@ -104,7 +104,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like Whether or not to clean up spaces after decoding; cleanup consists of removing potential artifacts like
extra spaces. extra spaces.
use_default_system_prompt (`bool`, *optional*, defaults to `True`): use_default_system_prompt (`bool`, *optional*, defaults to `False`):
Whether or not the default system prompt for Llama should be used. Whether or not the default system prompt for Llama should be used.
spaces_between_special_tokens (`bool`, *optional*, defaults to `False`): spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to add spaces between special tokens. Whether or not to add spaces between special tokens.
@@ -149,7 +149,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
add_bos_token=True, add_bos_token=True,
add_eos_token=False, add_eos_token=False,
clean_up_tokenization_spaces=False, clean_up_tokenization_spaces=False,
use_default_system_prompt=True, use_default_system_prompt=False,
spaces_between_special_tokens=False, spaces_between_special_tokens=False,
legacy=None, legacy=None,
**kwargs, **kwargs,

View File

@@ -98,7 +98,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
Whether or not to add a `bos_token` at the start of sequences. Whether or not to add a `bos_token` at the start of sequences.
add_eos_token (`bool`, *optional*, defaults to `False`): add_eos_token (`bool`, *optional*, defaults to `False`):
Whether or not to add an `eos_token` at the end of sequences. Whether or not to add an `eos_token` at the end of sequences.
use_default_system_prompt (`bool`, *optional*, defaults to `True`): use_default_system_prompt (`bool`, *optional*, defaults to `False`):
Whether or not the default system prompt for Llama should be used. Whether or not the default system prompt for Llama should be used.
""" """
@@ -118,7 +118,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
eos_token="</s>", eos_token="</s>",
add_bos_token=True, add_bos_token=True,
add_eos_token=False, add_eos_token=False,
use_default_system_prompt=True, use_default_system_prompt=False,
**kwargs, **kwargs,
): ):
super().__init__( super().__init__(

View File

@@ -615,7 +615,7 @@ class LlamaIntegrationTest(unittest.TestCase):
expected_tokens = [ expected_tokens = [
[1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962], [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962],
[1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962, 20103, 304, 5870, 366, 29889, 29871, 2], [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962, 20103, 304, 5870, 366, 29889, 29871, 2],
[1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889, 29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641, 9109, 29889, 3575, 6089, 881, 451, 3160, 738, 10311, 1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916, 391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793, 29889, 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443, 5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644, 263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338, 451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012, 310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915, 29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016, 29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962] [1, 29961, 25580, 29962, 15043, 29991, 518, 29914, 25580, 29962]
] ]
# fmt: on # fmt: on
for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):