Separate chat templates into a single file (#33957)
* Initial draft * Add .jinja file loading for processors * Add processor saving of naked chat template files * make fixup * Add save-load test for tokenizers * Add save-load test for tokenizers * stash commit * Try popping the file * make fixup * Pop the arg correctly * Pop the arg correctly * Add processor test * Fix processor code * stash commit * Processor clobbers child tokenizer's chat template * Processor clobbers child tokenizer's chat template * make fixup * Split processor/tokenizer files to avoid interactions * fix test * Expand processor tests * Rename arg to "save_raw_chat_template" across all classes * Update processor warning * Move templates to single file * Move templates to single file * Improve testing for processor/tokenizer clashes * Improve testing for processor/tokenizer clashes * Extend saving test * Test file priority correctly * make fixup * Don't pop the chat template file before the slow tokenizer gets a look * Remove breakpoint * make fixup * Fix error
This commit is contained in:
@@ -18,6 +18,7 @@ import inspect
|
||||
import json
|
||||
import random
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
@@ -519,3 +520,27 @@ class ProcessorTesterMixin:
|
||||
processor.prepare_and_validate_optional_call_args(
|
||||
*(f"optional_{i}" for i in range(num_optional_call_args + 1))
|
||||
)
|
||||
|
||||
def test_chat_template_save_loading(self):
|
||||
processor = self.get_processor()
|
||||
existing_tokenizer_template = getattr(processor.tokenizer, "chat_template", None)
|
||||
processor.chat_template = "test template"
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
processor.save_pretrained(tmpdirname)
|
||||
self.assertTrue(Path(tmpdirname, "chat_template.json").is_file())
|
||||
self.assertFalse(Path(tmpdirname, "chat_template.jinja").is_file())
|
||||
reloaded_processor = self.processor_class.from_pretrained(tmpdirname)
|
||||
self.assertEqual(processor.chat_template, reloaded_processor.chat_template)
|
||||
# When we don't use single-file chat template saving, processor and tokenizer chat templates
|
||||
# should remain separate
|
||||
self.assertEqual(getattr(reloaded_processor.tokenizer, "chat_template", None), existing_tokenizer_template)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
processor.save_pretrained(tmpdirname, save_raw_chat_template=True)
|
||||
self.assertTrue(Path(tmpdirname, "chat_template.jinja").is_file())
|
||||
self.assertFalse(Path(tmpdirname, "chat_template.json").is_file())
|
||||
reloaded_processor = self.processor_class.from_pretrained(tmpdirname)
|
||||
self.assertEqual(processor.chat_template, reloaded_processor.chat_template)
|
||||
# When we save as single files, tokenizers and processors share a chat template, which means
|
||||
# the reloaded tokenizer should get the chat template as well
|
||||
self.assertEqual(reloaded_processor.chat_template, reloaded_processor.tokenizer.chat_template)
|
||||
|
||||
Reference in New Issue
Block a user