[CodeLlamaTokenizer] Nit, update __init__ to make sure the AddedTokens are not normalized because they are special (#27359)
* make sure tokens are properly initialized for codellama slow * add more pretrained models * style * test more tokenizers checkpoints
This commit is contained in:
@@ -149,9 +149,9 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
):
|
):
|
||||||
requires_backends(self, "protobuf")
|
requires_backends(self, "protobuf")
|
||||||
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
||||||
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
|
bos_token = AddedToken(bos_token, normalized=False, special=True) if isinstance(bos_token, str) else bos_token
|
||||||
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
eos_token = AddedToken(eos_token, normalized=False, special=True) if isinstance(eos_token, str) else eos_token
|
||||||
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
|
unk_token = AddedToken(unk_token, normalized=False, special=True) if isinstance(unk_token, str) else unk_token
|
||||||
|
|
||||||
self.use_default_system_prompt = use_default_system_prompt
|
self.use_default_system_prompt = use_default_system_prompt
|
||||||
# mark tokens special to skip them
|
# mark tokens special to skip them
|
||||||
|
|||||||
@@ -150,6 +150,8 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
self.tokenizers_list = [
|
self.tokenizers_list = [
|
||||||
(self.rust_tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}),
|
(self.rust_tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}),
|
||||||
(self.tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}),
|
(self.tokenizer_class, "hf-internal-testing/llama-code-tokenizer", {}),
|
||||||
|
(self.tokenizer_class, "codellama/CodeLlama-34b-Instruct-hf", {}),
|
||||||
|
(self.rust_tokenizer_class, "codellama/CodeLlama-34b-Instruct-hf", {}),
|
||||||
]
|
]
|
||||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||||
|
|||||||
Reference in New Issue
Block a user