[TokenizationLlama] fix the way we convert tokens to strings to keep leading spaces 🚨 breaking fix (#29453)

* nit

* update test and fix test

* fixup
This commit is contained in:
Arthur
2024-03-28 21:58:40 +09:00
committed by GitHub
parent e677479c81
commit a2a7f71604
2 changed files with 15 additions and 0 deletions

View File

@@ -581,6 +581,19 @@ class LlamaIntegrationTest(unittest.TestCase):
decoded_tokens = tokenizer.decode(input_ids)
self.assertEqual(decoded_tokens, " <s> Hello<s> how")
# Let's make sure the space is preserved
input_ids = tokenizer.encode("hello", add_special_tokens=True)
self.assertEqual(input_ids, [1, 22172])
tokens = tokenizer.tokenize("hello")
self.assertEqual(tokens, ["▁hello"])
decoded_tokens = tokenizer.decode(input_ids)
self.assertEqual(decoded_tokens, "<s> hello")
input_ids = tokenizer.encode("hello", add_special_tokens=False)
self.assertEqual(input_ids, [22172])
decoded_tokens = tokenizer.decode(input_ids)
self.assertEqual(decoded_tokens, "hello")
def test_some_edge_cases(self):
tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False)