[LlamaTokenizer] tokenize nits. (#25793)
* return when length is zero * Add tests Co-authored-by: Avnish Narayan <38871737avnishn@users.noreply.github.com> * Co-authored-by: avnishn <38871737+avnishn@users.noreply.github.com> * codeLlama doc should not be on Main * update test --------- Co-authored-by: Avnish Narayan <38871737avnishn@users.noreply.github.com>
This commit is contained in:
@@ -555,6 +555,25 @@ class LlamaIntegrationTest(unittest.TestCase):
|
||||
self.assertNotEqual(sp_tokens, tokens)
|
||||
self.assertEqual(tokens, ["<s>", ">"])
|
||||
|
||||
tokens = tokenizer.tokenize("")
|
||||
self.assertEqual(tokens, [])
|
||||
self.assertEqual(tokens, tokenizer.sp_model.encode("", out_type=str))
|
||||
|
||||
tokens = tokenizer.tokenize(" ")
|
||||
self.assertEqual(tokens, ["▁▁"])
|
||||
# a dummy prefix space is not added by the sp_model as it was de-activated
|
||||
self.assertEqual(tokens, tokenizer.sp_model.encode(" ", out_type=str))
|
||||
|
||||
tokens = tokenizer.tokenize("▁")
|
||||
self.assertEqual(tokens, ["▁▁"])
|
||||
# a dummy prefix space is not added by the sp_model as it was de-activated
|
||||
self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁", out_type=str))
|
||||
|
||||
tokens = tokenizer.tokenize(" ▁")
|
||||
self.assertEqual(tokens, ["▁▁▁"])
|
||||
# a dummy prefix space is not added by the sp_model as it was de-activated
|
||||
self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁▁", out_type=str))
|
||||
|
||||
|
||||
@require_sentencepiece
|
||||
@require_tokenizers
|
||||
@@ -583,6 +602,18 @@ class CommonSpmIntegrationTests(unittest.TestCase):
|
||||
tokens = self.tokenizer.tokenize(". Hello")
|
||||
self.assertEqual(tokens, ["▁", ".", "▁He", "ll", "o"])
|
||||
|
||||
tokens = self.tokenizer.tokenize("")
|
||||
self.assertEqual(tokens, [])
|
||||
self.assertEqual(tokens, self.tokenizer.sp_model.encode("", out_type=str))
|
||||
|
||||
tokens = self.tokenizer.tokenize(" ")
|
||||
self.assertEqual(tokens, [])
|
||||
self.assertEqual(tokens, self.tokenizer.sp_model.encode(" ", out_type=str))
|
||||
|
||||
tokens = self.tokenizer.tokenize("▁")
|
||||
self.assertEqual(tokens, [])
|
||||
self.assertEqual(tokens, self.tokenizer.sp_model.encode("▁", out_type=str))
|
||||
|
||||
def test_remove_extra_whitespaces(self):
|
||||
# make sure the extra spaces are eaten. Since the sample vocab does not have
|
||||
# `______`. sentencepiece.NormalizerSpec.remove_extra_whitespaces attribute is set to False
|
||||
|
||||
Reference in New Issue
Block a user