[LlamaTokenizer] tokenize nits. (#25793)
* return when length is zero
* Add tests

Co-authored-by: Avnish Narayan <38871737avnishn@users.noreply.github.com>

* Co-authored-by: avnishn <38871737+avnishn@users.noreply.github.com>
* codeLlama doc should not be on Main
* update test

---------

Co-authored-by: Avnish Narayan <38871737avnishn@users.noreply.github.com>
This commit is contained in:
@@ -555,6 +555,25 @@ class LlamaIntegrationTest(unittest.TestCase):
|
||||
self.assertNotEqual(sp_tokens, tokens)
|
||||
self.assertEqual(tokens, ["<s>", ">"])
|
||||
|
||||
tokens = tokenizer.tokenize("")
|
||||
self.assertEqual(tokens, [])
|
||||
self.assertEqual(tokens, tokenizer.sp_model.encode("", out_type=str))
|
||||
|
||||
tokens = tokenizer.tokenize(" ")
|
||||
self.assertEqual(tokens, ["▁▁"])
|
||||
# a dummy prefix space is not added by the sp_model as it was de-activated
|
||||
self.assertEqual(tokens, tokenizer.sp_model.encode(" ", out_type=str))
|
||||
|
||||
tokens = tokenizer.tokenize("▁")
|
||||
self.assertEqual(tokens, ["▁▁"])
|
||||
# a dummy prefix space is not added by the sp_model as it was de-activated
|
||||
self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁", out_type=str))
|
||||
|
||||
tokens = tokenizer.tokenize(" ▁")
|
||||
self.assertEqual(tokens, ["▁▁▁"])
|
||||
# a dummy prefix space is not added by the sp_model as it was de-activated
|
||||
self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁▁", out_type=str))
|
||||
|
||||
|
||||
@require_sentencepiece
|
||||
@require_tokenizers
|
||||
@@ -583,6 +602,18 @@ class CommonSpmIntegrationTests(unittest.TestCase):
|
||||
tokens = self.tokenizer.tokenize(". Hello")
|
||||
self.assertEqual(tokens, ["▁", ".", "▁He", "ll", "o"])
|
||||
|
||||
tokens = self.tokenizer.tokenize("")
|
||||
self.assertEqual(tokens, [])
|
||||
self.assertEqual(tokens, self.tokenizer.sp_model.encode("", out_type=str))
|
||||
|
||||
tokens = self.tokenizer.tokenize(" ")
|
||||
self.assertEqual(tokens, [])
|
||||
self.assertEqual(tokens, self.tokenizer.sp_model.encode(" ", out_type=str))
|
||||
|
||||
tokens = self.tokenizer.tokenize("▁")
|
||||
self.assertEqual(tokens, [])
|
||||
self.assertEqual(tokens, self.tokenizer.sp_model.encode("▁", out_type=str))
|
||||
|
||||
def test_remove_extra_whitespaces(self):
|
||||
# make sure the extra spaces are eaten. Since the sample vocab does not have
|
||||
# `______`. sentencepiece.NormalizerSpec.remove_extra_whitespaces attribute is set to False
|
||||
|
||||
@@ -400,6 +400,31 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
tokenizer = T5TokenizerFast(SAMPLE_VOCAB, extra_ids=10)
|
||||
self.assertListEqual(sorted(tokenizer.get_sentinel_token_ids()), sorted(range(1000, 1010)))
|
||||
|
||||
def test_some_edge_cases(self):
    """Exercise special-token splitting and blank inputs on the non-legacy T5 tokenizer."""
    tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
    spm = tokenizer.sp_model

    # The raw sentencepiece model breaks "</s>" into single characters,
    # whereas the tokenizer is expected to keep it as one token.
    raw_pieces = spm.encode("</s>>", out_type=str)
    self.assertEqual(raw_pieces, ["<", "/", "s", ">", ">"])
    pieces = tokenizer.tokenize("</s>>")
    self.assertNotEqual(raw_pieces, pieces)
    self.assertEqual(pieces, ["</s>", ">"])

    # Each pair is (text passed to the tokenizer, text passed to sentencepiece
    # for the reference encoding). Every case must produce no tokens at all.
    # NOTE: the last pair deliberately keeps the original reference input "▁"
    # even though the tokenizer input is " ▁".
    blank_cases = (
        ("", ""),
        (" ", " "),
        ("▁", "▁"),
        (" ▁", "▁"),
    )
    for text, reference_text in blank_cases:
        pieces = tokenizer.tokenize(text)
        self.assertEqual(pieces, [])
        self.assertEqual(pieces, spm.encode(reference_text, out_type=str))
|
||||
|
||||
|
||||
@require_sentencepiece
|
||||
@require_tokenizers
|
||||
@@ -427,6 +452,18 @@ class CommonSpmIntegrationTests(unittest.TestCase):
|
||||
tokens = self.tokenizer.tokenize(". Hello")
|
||||
self.assertEqual(tokens, ["▁", ".", "▁He", "ll", "o"])
|
||||
|
||||
tokens = self.tokenizer.tokenize("")
|
||||
self.assertEqual(tokens, [])
|
||||
self.assertEqual(tokens, self.tokenizer.sp_model.encode("", out_type=str))
|
||||
|
||||
tokens = self.tokenizer.tokenize(" ")
|
||||
self.assertEqual(tokens, [])
|
||||
self.assertEqual(tokens, self.tokenizer.sp_model.encode(" ", out_type=str))
|
||||
|
||||
tokens = self.tokenizer.tokenize("▁")
|
||||
self.assertEqual(tokens, [])
|
||||
self.assertEqual(tokens, self.tokenizer.sp_model.encode("▁", out_type=str))
|
||||
|
||||
def test_remove_extra_whitespaces(self):
|
||||
# make sure the extra spaces are eaten
|
||||
# sentencepiece.NormalizerSpec.remove_extra_whitespaces attribute
|
||||
|
||||
Reference in New Issue
Block a user