[LlamaTokenizerFast] Adds edge cases for the template processor (#26606)

* make sure eos and bos are properly handled for fast tokenizer
* fix code llama as well
* nits
* fix the conversion script as well
* fix failing test
2023-10-06 16:40:54 +02:00
parent 27597fea07
commit 9ad815e412
4 changed files with 27 additions and 30 deletions
--- a/tests/models/llama/test_tokenization_llama.py
+++ b/tests/models/llama/test_tokenization_llama.py
@@ -582,6 +582,19 @@ class LlamaIntegrationTest(unittest.TestCase):
        # a dummy prefix space is not added by the sp_model as it was de-activated
        self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁▁", out_type=str))

+    def test_fast_post_processor(self):  # fast-tokenizer construction with bos/eos tokens set to None
+        tokenizer = LlamaTokenizerFast(
+            SAMPLE_VOCAB, eos_token=None, bos_token=None, add_bos_token=False, add_eos_token=False
+        )
+        tokenizer.encode(" Hey ")  # return value is discarded: this only checks that encoding does not raise
+
+        with self.assertRaises(ValueError):  # add_bos_token=True with bos_token=None (presumably the trigger)
+            tokenizer = LlamaTokenizerFast(
+                SAMPLE_VOCAB, bos_token=None, eos_token="<s>", add_bos_token=True, add_eos_token=False
+            )
+        with self.assertRaises(ValueError):  # add_eos_token=True with eos_token=None (presumably the trigger)
+            tokenizer = LlamaTokenizerFast(SAMPLE_VOCAB, eos_token=None, add_bos_token=True, add_eos_token=True)
+
    @require_jinja
    def test_tokenization_for_chat(self):
        tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)