Update tokenization_code_llama_fast.py (#26576)

* Update tokenization_code_llama_fast.py

* Update test_tokenization_code_llama.py

* Update test_tokenization_code_llama.py
This commit is contained in:
Tianqi Liu
2023-10-06 01:49:02 -07:00
committed by GitHub
parent af38c837ee
commit 65aabafe2f
2 changed files with 13 additions and 1 deletions

View File

@@ -643,3 +643,15 @@ end
input_ids = tokenizer.encode(PROMPTS[0])
self.assertEqual(input_ids, tokenizer.encode(prefix, suffix=suffix))
self.assertEqual(tokenizer.encode(prefix, suffix=suffix), tokenizer_fast.encode(prefix, suffix=suffix))
# Adding suffix_first check for infilling tasks
suffix_first_formatted_prompt = tokenizer.tokenize(PROMPTS[0], suffix_first=True)
self.assertEqual(suffix_first_formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0], suffix_first=True))
prefix, suffix = PROMPTS[0].split("<FILL_ME>")
self.assertEqual(suffix_first_formatted_prompt, tokenizer.tokenize(prefix, suffix, suffix_first=True))
self.assertEqual(suffix_first_formatted_prompt, tokenizer_fast.tokenize(prefix, suffix, suffix_first=True))
prefix, suffix = PROMPTS[0].split("<FILL_ME>")
suffix_first_input_ids = tokenizer.encode(PROMPTS[0], suffix_first=True)
self.assertEqual(suffix_first_input_ids, tokenizer.encode(prefix, suffix=suffix, suffix_first=True))
self.assertEqual(suffix_first_input_ids, tokenizer_fast.encode(prefix, suffix=suffix, suffix_first=True))