[SPM] Patch spm Llama and T5 (#25656)
* hot fix * only encode with string prefix if starts with prefix * styling * add a new test * fixup
This commit is contained in:
@@ -546,6 +546,15 @@ class LlamaIntegrationTest(unittest.TestCase):
|
||||
decoded_tokens = tokenizer.decode(input_ids)
|
||||
self.assertEqual(decoded_tokens, " <s> Hello<s> how")
|
||||
|
||||
def test_some_edge_cases(self):
    """Compare raw sentencepiece output with the tokenizer's output on `"<s>>"`.

    The input is a `<s>` marker immediately followed by an extra `>`. The
    assertions pin that the underlying sentencepiece model breaks the text
    into `<`, `s`, `>>`, while the non-legacy tokenizer keeps `<s>` as a
    single token followed by `>`.
    """
    tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)
    text = "<s>>"

    # Encoding directly through the sentencepiece model yields plain pieces.
    raw_pieces = tokenizer.sp_model.encode(text, out_type=str)
    self.assertEqual(raw_pieces, ["<", "s", ">>"])

    # The tokenizer-level result must differ: `<s>` stays whole.
    tokenized = tokenizer.tokenize(text)
    self.assertNotEqual(raw_pieces, tokenized)
    self.assertEqual(tokenized, ["<s>", ">"])
|
||||
|
||||
|
||||
@require_sentencepiece
|
||||
@require_tokenizers
|
||||
|
||||
Reference in New Issue
Block a user