[LlamaTokenizer] tokenize nits. (#25793)

* return when length is zero

* Add tests

Co-authored-by: Avnish Narayan <38871737+avnishn@users.noreply.github.com>

* Co-authored-by: avnishn <38871737+avnishn@users.noreply.github.com>

* codeLlama doc should not be on Main

* update test

---------

Co-authored-by: Avnish Narayan <38871737+avnishn@users.noreply.github.com>
This commit is contained in:
Arthur
2023-08-29 15:08:14 +02:00
committed by GitHub
parent 9525515cd4
commit 5b5ee235f3
5 changed files with 78 additions and 11 deletions

View File

@@ -555,6 +555,25 @@ class LlamaIntegrationTest(unittest.TestCase):
self.assertNotEqual(sp_tokens, tokens)
self.assertEqual(tokens, ["<s>", ">"])
tokens = tokenizer.tokenize("")
self.assertEqual(tokens, [])
self.assertEqual(tokens, tokenizer.sp_model.encode("", out_type=str))
tokens = tokenizer.tokenize(" ")
self.assertEqual(tokens, ["▁▁"])
# a dummy prefix space is not added by the sp_model as it was de-activated
self.assertEqual(tokens, tokenizer.sp_model.encode(" ", out_type=str))
tokens = tokenizer.tokenize("")
self.assertEqual(tokens, ["▁▁"])
# a dummy prefix space is not added by the sp_model as it was de-activated
self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁", out_type=str))
tokens = tokenizer.tokenize("")
self.assertEqual(tokens, ["▁▁▁"])
# a dummy prefix space is not added by the sp_model as it was de-activated
self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁▁", out_type=str))
@require_sentencepiece
@require_tokenizers
@@ -583,6 +602,18 @@ class CommonSpmIntegrationTests(unittest.TestCase):
tokens = self.tokenizer.tokenize(". Hello")
self.assertEqual(tokens, ["", ".", "▁He", "ll", "o"])
tokens = self.tokenizer.tokenize("")
self.assertEqual(tokens, [])
self.assertEqual(tokens, self.tokenizer.sp_model.encode("", out_type=str))
tokens = self.tokenizer.tokenize(" ")
self.assertEqual(tokens, [])
self.assertEqual(tokens, self.tokenizer.sp_model.encode(" ", out_type=str))
tokens = self.tokenizer.tokenize("")
self.assertEqual(tokens, [])
self.assertEqual(tokens, self.tokenizer.sp_model.encode("", out_type=str))
def test_remove_extra_whitespaces(self):
# make sure the extra spaces are eaten. Since the sample vocab does not have
# `______`. sentencepiece.NormalizerSpec.remove_extra_whitespaces attribute is set to False

View File

@@ -400,6 +400,31 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer = T5TokenizerFast(SAMPLE_VOCAB, extra_ids=10)
self.assertListEqual(sorted(tokenizer.get_sentinel_token_ids()), sorted(range(1000, 1010)))
def test_some_edge_cases(self):
    """Exercise the non-legacy T5 tokenizer on `</s>` handling and blank inputs.

    The first half checks that `tokenize` keeps `</s>` as a single token
    while the underlying sentencepiece model splits it into characters.
    The second half checks that each blank input tokenizes to an empty list
    and that this matches what the sentencepiece model returns for the same
    text.
    """
    tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)

    # The raw sentencepiece model has no notion of the special token, so it
    # breaks "</s>" apart; the tokenizer is expected to keep it whole.
    raw_pieces = tokenizer.sp_model.encode("</s>>", out_type=str)
    self.assertEqual(raw_pieces, ["<", "/", "s", ">", ">"])
    pieces = tokenizer.tokenize("</s>>")
    self.assertNotEqual(raw_pieces, pieces)
    self.assertEqual(pieces, ["</s>", ">"])

    # NOTE(review): the last two inputs are empty strings, repeating the
    # first case - confirm whether other characters were intended here.
    for text in ("", " ", "", ""):
        pieces = tokenizer.tokenize(text)
        self.assertEqual(pieces, [])
        self.assertEqual(pieces, tokenizer.sp_model.encode(text, out_type=str))
@require_sentencepiece
@require_tokenizers
@@ -427,6 +452,18 @@ class CommonSpmIntegrationTests(unittest.TestCase):
tokens = self.tokenizer.tokenize(". Hello")
self.assertEqual(tokens, ["", ".", "▁He", "ll", "o"])
tokens = self.tokenizer.tokenize("")
self.assertEqual(tokens, [])
self.assertEqual(tokens, self.tokenizer.sp_model.encode("", out_type=str))
tokens = self.tokenizer.tokenize(" ")
self.assertEqual(tokens, [])
self.assertEqual(tokens, self.tokenizer.sp_model.encode(" ", out_type=str))
tokens = self.tokenizer.tokenize("")
self.assertEqual(tokens, [])
self.assertEqual(tokens, self.tokenizer.sp_model.encode("", out_type=str))
def test_remove_extra_whitespaces(self):
# make sure the extra spaces are eaten
# sentencepiece.NormalizerSpec.remove_extra_whitespaces attribute