[SPM] Patch spm Llama and T5 (#25656)

* hot fix

* only encode with string prefix if starts with prefix

* styling

* add a new test

* fixup
This commit is contained in:
Arthur
2023-08-23 07:16:43 +02:00
committed by GitHub
parent 57943630e2
commit 51794bf21e
3 changed files with 27 additions and 12 deletions

View File

@@ -546,6 +546,15 @@ class LlamaIntegrationTest(unittest.TestCase):
decoded_tokens = tokenizer.decode(input_ids)
self.assertEqual(decoded_tokens, " <s> Hello<s> how")
def test_some_edge_cases(self):
    """Check how the non-legacy tokenizer handles a special token followed by `>`.

    The input "<s>>" is encoded twice: once directly through the underlying
    SentencePiece model and once through `tokenizer.tokenize`. The two
    results are expected to differ, with `tokenize` keeping "<s>" as a
    single token.
    """
    text = "<s>>"
    tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)

    # Raw SentencePiece output for the same text: "<s>" is not kept whole.
    raw_pieces = tokenizer.sp_model.encode(text, out_type=str)
    self.assertEqual(raw_pieces, ["<", "s", ">>"])

    # The tokenizer's own `tokenize` must not simply mirror the raw pieces;
    # it is expected to emit "<s>" as one token followed by ">".
    tokenized = tokenizer.tokenize(text)
    self.assertNotEqual(raw_pieces, tokenized)
    self.assertEqual(tokenized, ["<s>", ">"])
@require_sentencepiece
@require_tokenizers