From 7f6e87413f03b41b0896b372e826b27df08ed4c8 Mon Sep 17 00:00:00 2001
From: Ita Zaporozhets <31893021+itazap@users.noreply.github.com>
Date: Fri, 24 May 2024 10:03:00 +0200
Subject: [PATCH] add prefix space ignored in llama #29625 (#30964)

* add prefix space ignored in llama #29625

* adding test with add_prefix_space=False

* ruff

---------

Co-authored-by: Ita Zaporozhets
---
 src/transformers/models/llama/tokenization_llama_fast.py | 1 +
 tests/models/llama/test_tokenization_llama.py | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/src/transformers/models/llama/tokenization_llama_fast.py b/src/transformers/models/llama/tokenization_llama_fast.py
index 580290841c..44168fbedc 100644
--- a/src/transformers/models/llama/tokenization_llama_fast.py
+++ b/src/transformers/models/llama/tokenization_llama_fast.py
@@ -163,6 +163,7 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
             add_bos_token=add_bos_token,
             add_eos_token=add_eos_token,
             use_default_system_prompt=use_default_system_prompt,
+            add_prefix_space=add_prefix_space,
             legacy=legacy,
             **kwargs,
         )
diff --git a/tests/models/llama/test_tokenization_llama.py b/tests/models/llama/test_tokenization_llama.py
index 84bd6d7a9d..fba883513f 100644
--- a/tests/models/llama/test_tokenization_llama.py
+++ b/tests/models/llama/test_tokenization_llama.py
@@ -602,6 +602,10 @@ class LlamaIntegrationTest(unittest.TestCase):
         self.assertEqual(decoded_tokens, "hello")
 
     def test_no_prefix_space(self):
+        tokenizer_no_prefix_space = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b", add_prefix_space=False)
+        no_prefix_space_tokens = tokenizer_no_prefix_space.tokenize("Hey")
+        self.assertEqual(no_prefix_space_tokens, ["H", "ey"])
+
         tokenizer = LlamaTokenizerFast.from_pretrained(
             "huggyllama/llama-7b", legacy=False, from_slow=True, add_prefix_space=False
         )