Fix edge case for continue_final_message (#36404)

* Fix edge case for continue_final_message

* lstrip() correctly

* Add regression test

* Add a clearer error message when the final message is not present

* Add a clearer error message when the final message is not present

* Fix massive bug!
This commit is contained in:
Matt
2025-03-03 18:03:03 +00:00
committed by GitHub
parent 2aff938992
commit 1975be4d97
2 changed files with 51 additions and 7 deletions

View File

@@ -1565,6 +1565,33 @@ class TokenizerTesterMixin:
"<|im_start|>system\nsystem message<|im_end|>\n<|im_start|>user\nuser message<|im_end|>\n<|im_start|>assistant\nassistant message",
)
@require_jinja
def test_continue_final_message_with_decoy_earlier_message(self):
    """Check `continue_final_message=True` when an earlier turn resembles the final turn.

    The first assistant turn ("bye: 0") begins with the same text as the final assistant
    turn ("bye: "), so it acts as a decoy. The rendered prompt is expected to end right
    after the final turn's trimmed content ("bye:"), with no closing "<|im_end|>".

    Regression test for https://github.com/huggingface/transformers/issues/35433
    """
    # Jinja's `-` whitespace-control markers strip the whitespace preceding each tag, so the
    # line breaks inside this literal do not appear in the rendered output.
    chat_template = """
{%- for message in messages %}
{{- "<|im_start|>" + message['role'] + "\n" + message['content'] | trim + "<|im_end|>" + "\n"}}
{%- endfor %}"""
    conversation = [
        {"role": "user", "content": "hi 0"},
        {"role": "assistant", "content": "bye: 0"},
        {"role": "user", "content": "hi 1"},
        {"role": "assistant", "content": "bye: "},
    ]
    # Every turn except the last is closed with "<|im_end|>\n"; the last one is left open.
    expected_prefill = (
        "<|im_start|>user\nhi 0<|im_end|>\n"
        "<|im_start|>assistant\nbye: 0<|im_end|>\n"
        "<|im_start|>user\nhi 1<|im_end|>\n"
        "<|im_start|>assistant\nbye:"
    )
    for tokenizer in self.get_tokenizers():
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            rendered = tokenizer.apply_chat_template(
                conversation,
                chat_template=chat_template,
                tokenize=False,
                continue_final_message=True,
            )
            # Assert that the final message is unterminated
            self.assertEqual(rendered, expected_prefill)
@require_jinja
def test_chat_template_dict(self):
dummy_template_1 = "{{'a'}}"