Update-llama-code (#25826)
* some bug fixes * updates * Update code_llama.md Co-authored-by: Omar Sanseviero <osanseviero@users.noreply.github.com> * Add co author Co-authored-by: pcuenca <pedro@latenitesoft.com> * add a test * fixup * nits * some updates * fix-coies * adress comments * nits * nits * fix docsting * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * update * add int for https://huggingface.co/spaces/hf-accelerate/model-memory-usage --------- Co-authored-by: Omar Sanseviero <osanseviero@users.noreply.github.com> Co-authored-by: pcuenca <pedro@latenitesoft.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
@@ -49,6 +49,8 @@ Here is a sample usage
|
|||||||
python src/transformers/models/llama/convert_llama_weights_to_hf.py \
|
python src/transformers/models/llama/convert_llama_weights_to_hf.py \
|
||||||
--input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
|
--input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
|
||||||
```
|
```
|
||||||
|
Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions
|
||||||
|
come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
|
||||||
|
|
||||||
- After conversion, the model and tokenizer can be loaded via:
|
- After conversion, the model and tokenizer can be loaded via:
|
||||||
|
|
||||||
@@ -90,8 +92,8 @@ If you only want the infilled part:
|
|||||||
>>> generator = pipeline("text-generation",model="codellama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto")
|
>>> generator = pipeline("text-generation",model="codellama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto")
|
||||||
>>> generator('def remove_non_ascii(s: str) -> str:\n """ <FILL_ME>\n return result', max_new_tokens = 128, return_type = 1)
|
>>> generator('def remove_non_ascii(s: str) -> str:\n """ <FILL_ME>\n return result', max_new_tokens = 128, return_type = 1)
|
||||||
```
|
```
|
||||||
Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions
|
|
||||||
come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). For the 75B model, it's thus 145GB of RAM needed.
|
Under the hood, the tokenizer [automatically splits by `<FILL_ME>`](https://huggingface.co/docs/transformers/main/model_doc/code_llama#transformers.CodeLlamaTokenizer.fill_token) to create a formatted input string that follows [the original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself: it avoids pitfalls, such as token glueing, that are very hard to debug. To see how much CPU and GPU memory you need for this model or others, try [this calculator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage) which can help determine that value.
|
||||||
|
|
||||||
- The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string.
|
- The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string.
|
||||||
|
|
||||||
|
|||||||
@@ -64,6 +64,10 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
Construct a CodeLlama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as
|
Construct a CodeLlama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as
|
||||||
there is no padding token in the original model.
|
there is no padding token in the original model.
|
||||||
|
|
||||||
|
The default configuration matches that of
|
||||||
|
[codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/tokenizer_config.json)
|
||||||
|
which supports prompt infilling.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_file (`str`):
|
vocab_file (`str`):
|
||||||
Path to the vocabulary file.
|
Path to the vocabulary file.
|
||||||
@@ -80,8 +84,6 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
unk_token (`str`, *optional*, defaults to `"<unk>"`):
|
unk_token (`str`, *optional*, defaults to `"<unk>"`):
|
||||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||||
token instead.
|
token instead.
|
||||||
pad_token (`str`, *optional*, defaults to `"<pad>"`):
|
|
||||||
The token used for padding, for example when batching sequences of different lengths.
|
|
||||||
prefix_token (`str`, *optional*, defaults to `"▁<PRE>"`):
|
prefix_token (`str`, *optional*, defaults to `"▁<PRE>"`):
|
||||||
Prefix token used for infilling.
|
Prefix token used for infilling.
|
||||||
suffix_token (`str`, *optional*, defaults to `"▁<SUF>"`):
|
suffix_token (`str`, *optional*, defaults to `"▁<SUF>"`):
|
||||||
@@ -111,7 +113,8 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
|
- `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
|
||||||
BPE-dropout.
|
BPE-dropout.
|
||||||
|
use_default_system_prompt (`bool`, *optional*, defaults to `False`):
|
||||||
|
Whether or not the default system prompt for Llama should be used.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -125,7 +128,6 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
unk_token="<unk>",
|
unk_token="<unk>",
|
||||||
bos_token="<s>",
|
bos_token="<s>",
|
||||||
eos_token="</s>",
|
eos_token="</s>",
|
||||||
pad_token=None,
|
|
||||||
prefix_token="▁<PRE>",
|
prefix_token="▁<PRE>",
|
||||||
middle_token="▁<MID>",
|
middle_token="▁<MID>",
|
||||||
suffix_token="▁<SUF>",
|
suffix_token="▁<SUF>",
|
||||||
@@ -136,6 +138,8 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
add_bos_token=True,
|
add_bos_token=True,
|
||||||
add_eos_token=False,
|
add_eos_token=False,
|
||||||
clean_up_tokenization_spaces=False,
|
clean_up_tokenization_spaces=False,
|
||||||
|
additional_special_tokens=None,
|
||||||
|
use_default_system_prompt=False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
requires_backends(self, "protobuf")
|
requires_backends(self, "protobuf")
|
||||||
@@ -143,16 +147,17 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
|
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
|
||||||
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
|
||||||
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
|
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
|
||||||
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
|
|
||||||
|
|
||||||
|
self.use_default_system_prompt = use_default_system_prompt
|
||||||
# mark tokens special to skip them
|
# mark tokens special to skip them
|
||||||
additional_special_tokens = kwargs.pop("additional_special_tokens", [])
|
additional_special_tokens = additional_special_tokens or []
|
||||||
additional_special_tokens += [prefix_token, middle_token, suffix_token, eot_token]
|
for token in [prefix_token, middle_token, suffix_token, eot_token]:
|
||||||
|
additional_special_tokens += [token] if token is not None else []
|
||||||
|
|
||||||
super().__init__(
|
super().__init__(
|
||||||
bos_token=bos_token,
|
bos_token=bos_token,
|
||||||
eos_token=eos_token,
|
eos_token=eos_token,
|
||||||
unk_token=unk_token,
|
unk_token=unk_token,
|
||||||
pad_token=pad_token,
|
|
||||||
add_bos_token=add_bos_token,
|
add_bos_token=add_bos_token,
|
||||||
add_eos_token=add_eos_token,
|
add_eos_token=add_eos_token,
|
||||||
prefix_token=prefix_token,
|
prefix_token=prefix_token,
|
||||||
@@ -164,6 +169,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
suffix_first=suffix_first,
|
suffix_first=suffix_first,
|
||||||
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
|
||||||
additional_special_tokens=additional_special_tokens,
|
additional_special_tokens=additional_special_tokens,
|
||||||
|
use_default_system_prompt=use_default_system_prompt,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
self.vocab_file = vocab_file
|
self.vocab_file = vocab_file
|
||||||
@@ -239,6 +245,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
"""Returns vocab size"""
|
"""Returns vocab size"""
|
||||||
return self.sp_model.get_piece_size()
|
return self.sp_model.get_piece_size()
|
||||||
|
|
||||||
|
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_vocab
|
||||||
def get_vocab(self):
|
def get_vocab(self):
|
||||||
"""Returns vocab as a dict"""
|
"""Returns vocab as a dict"""
|
||||||
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
|
vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
|
||||||
@@ -247,7 +254,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
def tokenize(self, prefix, suffix=None, suffix_first=False, **kwargs) -> List[int]:
|
def tokenize(self, prefix, suffix=None, suffix_first=False, **kwargs) -> List[int]:
|
||||||
# add a prefix space to `prefix`
|
# add a prefix space to `prefix`
|
||||||
if self.fill_token in prefix and suffix is None:
|
if self.fill_token is not None and self.fill_token in prefix and suffix is None:
|
||||||
prefix, suffix = prefix.split(self.fill_token)
|
prefix, suffix = prefix.split(self.fill_token)
|
||||||
|
|
||||||
if len(prefix) > 0:
|
if len(prefix) > 0:
|
||||||
@@ -263,9 +270,9 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
if None in (self.prefix_id, self.middle_id, self.suffix_id):
|
if None in (self.prefix_id, self.middle_id, self.suffix_id):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Then input includes a `prefix` and a `suffix` used for the infilling task,"
|
"The input either includes a `prefix` and a `suffix` used for the infilling task,"
|
||||||
" the `prefix_id, middle_id, suffix_id` must all be initialized. Current"
|
f" or can be split on the {self.fill_token} token, creating a suffix and prefix,"
|
||||||
f" values : {self.prefix_id, self.middle_id, self.suffix_id}"
|
" but the model does not support `infilling`."
|
||||||
)
|
)
|
||||||
suffix_tokens = self._tokenize(suffix) # make sure CodeLlama sp model does not mess up
|
suffix_tokens = self._tokenize(suffix) # make sure CodeLlama sp model does not mess up
|
||||||
|
|
||||||
@@ -293,10 +300,12 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
# 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
|
# 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
|
||||||
return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
|
return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
|
||||||
|
|
||||||
|
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_token_to_id
|
||||||
def _convert_token_to_id(self, token):
|
def _convert_token_to_id(self, token):
|
||||||
"""Converts a token (str) in an id using the vocab."""
|
"""Converts a token (str) in an id using the vocab."""
|
||||||
return self.sp_model.piece_to_id(token)
|
return self.sp_model.piece_to_id(token)
|
||||||
|
|
||||||
|
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_id_to_token
|
||||||
def _convert_id_to_token(self, index):
|
def _convert_id_to_token(self, index):
|
||||||
"""Converts an index (integer) in a token (str) using the vocab."""
|
"""Converts an index (integer) in a token (str) using the vocab."""
|
||||||
token = self.sp_model.IdToPiece(index)
|
token = self.sp_model.IdToPiece(index)
|
||||||
@@ -320,6 +329,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
out_string += self.sp_model.decode(current_sub_tokens)
|
out_string += self.sp_model.decode(current_sub_tokens)
|
||||||
return out_string
|
return out_string
|
||||||
|
|
||||||
|
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.save_vocabulary
|
||||||
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
||||||
"""
|
"""
|
||||||
Save the vocabulary and special tokens file to a directory.
|
Save the vocabulary and special tokens file to a directory.
|
||||||
@@ -347,6 +357,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
return (out_vocab_file,)
|
return (out_vocab_file,)
|
||||||
|
|
||||||
|
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
|
||||||
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
|
||||||
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
|
bos_token_id = [self.bos_token_id] if self.add_bos_token else []
|
||||||
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
|
eos_token_id = [self.eos_token_id] if self.add_eos_token else []
|
||||||
@@ -358,6 +369,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask
|
||||||
def get_special_tokens_mask(
|
def get_special_tokens_mask(
|
||||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
||||||
) -> List[int]:
|
) -> List[int]:
|
||||||
@@ -395,6 +407,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
+ eos_token_id
|
+ eos_token_id
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.create_token_type_ids_from_sequences
|
||||||
def create_token_type_ids_from_sequences(
|
def create_token_type_ids_from_sequences(
|
||||||
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
||||||
) -> List[int]:
|
) -> List[int]:
|
||||||
@@ -443,7 +456,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
>>> from transformers import Conversation
|
>>> from transformers import Conversation
|
||||||
|
|
||||||
>>> Conversation(
|
>>> Conversation(
|
||||||
... "<<SYS>>\n Only answer with emojis, and charades\n<</SYS>>\n\nHow can I build a house in 10 septs?"
|
... "<<SYS>>\n Complete the functions without any documentation\n<</SYS>>\n\n `def remove_non_ascii(s: str) -> str:`"
|
||||||
... ) # doctest: +IGNORE_RESULT
|
... ) # doctest: +IGNORE_RESULT
|
||||||
```
|
```
|
||||||
Args:
|
Args:
|
||||||
@@ -453,16 +466,20 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
|
|||||||
`List[int]`:
|
`List[int]`:
|
||||||
Input ids for the conversation.
|
Input ids for the conversation.
|
||||||
"""
|
"""
|
||||||
if len(conversation.past_user_inputs) > 0:
|
if self.use_default_system_prompt:
|
||||||
if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
|
if len(conversation.past_user_inputs) > 0:
|
||||||
conversation.past_user_inputs[0] = (
|
if (
|
||||||
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
|
not conversation.past_user_inputs[0].startswith(B_SYS)
|
||||||
)
|
or E_SYS not in conversation.past_user_inputs[0]
|
||||||
elif conversation.new_user_input:
|
):
|
||||||
if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
|
conversation.past_user_inputs[0] = (
|
||||||
conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
|
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
|
||||||
else:
|
)
|
||||||
raise ValueError("Last message must be from user")
|
elif conversation.new_user_input:
|
||||||
|
if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
|
||||||
|
conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
|
||||||
|
else:
|
||||||
|
raise ValueError("Last message must be from user")
|
||||||
|
|
||||||
dialogue = list(conversation.iter_texts())
|
dialogue = list(conversation.iter_texts())
|
||||||
if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
|
if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
|
||||||
|
|||||||
@@ -73,7 +73,9 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
|
|
||||||
|
|
||||||
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
|
This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
|
||||||
refer to this superclass for more information regarding those methods.
|
refer to this superclass for more information regarding those methods. The default configuration matches that of
|
||||||
|
[codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/tokenizer_config.json)
|
||||||
|
which supports prompt infilling.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_file (`str`):
|
vocab_file (`str`):
|
||||||
@@ -104,6 +106,10 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
The token used to split the input between the prefix and suffix.
|
The token used to split the input between the prefix and suffix.
|
||||||
suffix_first (`bool`, *optional*, defaults to `False`):
|
suffix_first (`bool`, *optional*, defaults to `False`):
|
||||||
Whether the input prompt and suffix should be formatted with the suffix first.
|
Whether the input prompt and suffix should be formatted with the suffix first.
|
||||||
|
additional_special_tokens (`List[str]`, *optional*):
|
||||||
|
Additional special tokens used by the tokenizer.
|
||||||
|
use_default_system_prompt (`bool`, *optional*, defaults to `False`):
|
||||||
|
Whether or not the default system prompt for Llama should be used.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
@@ -124,13 +130,18 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
suffix_token="▁<SUF>",
|
suffix_token="▁<SUF>",
|
||||||
eot_token="▁<EOT>",
|
eot_token="▁<EOT>",
|
||||||
fill_token="<FILL_ME>",
|
fill_token="<FILL_ME>",
|
||||||
|
additional_special_tokens=None,
|
||||||
add_bos_token=True,
|
add_bos_token=True,
|
||||||
add_eos_token=False,
|
add_eos_token=False,
|
||||||
|
use_default_system_prompt=False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
# mark tokens special to skip them
|
# mark tokens special to skip them
|
||||||
additional_special_tokens = kwargs.pop("additional_special_tokens", [])
|
additional_special_tokens = additional_special_tokens or []
|
||||||
additional_special_tokens += [prefix_token, middle_token, suffix_token, eot_token]
|
for token in [prefix_token, middle_token, suffix_token, eot_token]:
|
||||||
|
additional_special_tokens += [token] if token is not None else []
|
||||||
|
self.use_default_system_prompt = use_default_system_prompt
|
||||||
|
|
||||||
super().__init__(
|
super().__init__(
|
||||||
vocab_file=vocab_file,
|
vocab_file=vocab_file,
|
||||||
tokenizer_file=tokenizer_file,
|
tokenizer_file=tokenizer_file,
|
||||||
@@ -144,6 +155,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
suffix_token=suffix_token,
|
suffix_token=suffix_token,
|
||||||
eot_token=eot_token,
|
eot_token=eot_token,
|
||||||
fill_token=fill_token,
|
fill_token=fill_token,
|
||||||
|
use_default_system_prompt=use_default_system_prompt,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
self._add_bos_token = add_bos_token
|
self._add_bos_token = add_bos_token
|
||||||
@@ -162,6 +174,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
def can_save_slow_tokenizer(self) -> bool:
|
def can_save_slow_tokenizer(self) -> bool:
|
||||||
return os.path.isfile(self.vocab_file) if self.vocab_file else False
|
return os.path.isfile(self.vocab_file) if self.vocab_file else False
|
||||||
|
|
||||||
|
# Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.update_post_processor
|
||||||
def update_post_processor(self):
|
def update_post_processor(self):
|
||||||
"""
|
"""
|
||||||
Updates the underlying post processor with the current `bos_token` and `eos_token`.
|
Updates the underlying post processor with the current `bos_token` and `eos_token`.
|
||||||
@@ -300,6 +313,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
self.set_infilling_processor(True)
|
self.set_infilling_processor(True)
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
|
# Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.save_vocabulary
|
||||||
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
|
||||||
if not self.can_save_slow_tokenizer:
|
if not self.can_save_slow_tokenizer:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@@ -343,12 +357,12 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
Returns:
|
Returns:
|
||||||
`List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
|
`List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
|
||||||
"""
|
"""
|
||||||
# TODO process the ids for fast? Or update the template processing for infilling task when using `tokenize_infilling`
|
|
||||||
if token_ids_1 is None:
|
if token_ids_1 is None:
|
||||||
return self.prefix_tokens + token_ids_0 + self.suffix_tokens
|
return self.bos_token_id + token_ids_0 + self.eos_token_id
|
||||||
return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
|
return self.bos_token_id + token_ids_0 + token_ids_1 + self.eos_token_id
|
||||||
|
|
||||||
def _build_conversation_input_ids(self, conversation: "Conversation"):
|
# Copied from transformers.models.code_llama.tokenization_code_llama.CodeLlamaTokenizer._build_conversation_input_ids
|
||||||
|
def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
|
||||||
r"""Builds the input ids for a conversation.
|
r"""Builds the input ids for a conversation.
|
||||||
This is the format used in the provided examples. System prompts should be manually added at the beginning of
|
This is the format used in the provided examples. System prompts should be manually added at the beginning of
|
||||||
the conversation. If no system prompt is given, the `DEFAULT_SYSTEM_PROMPT` will be used.
|
the conversation. If no system prompt is given, the `DEFAULT_SYSTEM_PROMPT` will be used.
|
||||||
@@ -363,7 +377,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
>>> from transformers import Conversation
|
>>> from transformers import Conversation
|
||||||
|
|
||||||
>>> Conversation(
|
>>> Conversation(
|
||||||
... "<<SYS>>\n Only answer with emojis, and charades\n<</SYS>>\n\nHow can I build a house in 10 septs?"
|
... "<<SYS>>\n Complete the functions without any documentation\n<</SYS>>\n\n `def remove_non_ascii(s: str) -> str:`"
|
||||||
... ) # doctest: +IGNORE_RESULT
|
... ) # doctest: +IGNORE_RESULT
|
||||||
```
|
```
|
||||||
Args:
|
Args:
|
||||||
@@ -373,16 +387,20 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
`List[int]`:
|
`List[int]`:
|
||||||
Input ids for the conversation.
|
Input ids for the conversation.
|
||||||
"""
|
"""
|
||||||
if len(conversation.past_user_inputs) > 0:
|
if self.use_default_system_prompt:
|
||||||
if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
|
if len(conversation.past_user_inputs) > 0:
|
||||||
conversation.past_user_inputs[0] = (
|
if (
|
||||||
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
|
not conversation.past_user_inputs[0].startswith(B_SYS)
|
||||||
)
|
or E_SYS not in conversation.past_user_inputs[0]
|
||||||
elif conversation.new_user_input:
|
):
|
||||||
if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
|
conversation.past_user_inputs[0] = (
|
||||||
conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
|
B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
|
||||||
else:
|
)
|
||||||
raise ValueError("Last message must be from user")
|
elif conversation.new_user_input:
|
||||||
|
if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
|
||||||
|
conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
|
||||||
|
else:
|
||||||
|
raise ValueError("Last message must be from user")
|
||||||
|
|
||||||
dialogue = list(conversation.iter_texts())
|
dialogue = list(conversation.iter_texts())
|
||||||
if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
|
if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
|
||||||
@@ -392,7 +410,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
"The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
|
"The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
|
||||||
)
|
)
|
||||||
|
|
||||||
dialog_tokens = []
|
dialog_tokens: List[int] = []
|
||||||
dialog_tokens += sum(
|
dialog_tokens += sum(
|
||||||
[
|
[
|
||||||
[self.bos_token_id]
|
[self.bos_token_id]
|
||||||
|
|||||||
@@ -65,6 +65,11 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
|||||||
tokenizer.pad_token = tokenizer.eos_token
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
tokenizer.save_pretrained(self.tmpdirname)
|
tokenizer.save_pretrained(self.tmpdirname)
|
||||||
|
|
||||||
|
def test_no_infilling_init(self):
|
||||||
|
tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, prefix_token=None, keep_accents=True)
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
tokenizer.tokenize("This is <FILL_ME> prefix")
|
||||||
|
|
||||||
def test_full_tokenizer(self):
|
def test_full_tokenizer(self):
|
||||||
tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
|
||||||
|
|
||||||
@@ -587,8 +592,8 @@ split,
|
|||||||
end
|
end
|
||||||
""",
|
""",
|
||||||
]
|
]
|
||||||
tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
|
tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
|
||||||
tokenizer_fast = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-hf")
|
tokenizer_fast = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
|
||||||
|
|
||||||
formatted_prompt = tokenizer.tokenize(PROMPTS[0])
|
formatted_prompt = tokenizer.tokenize(PROMPTS[0])
|
||||||
self.assertEqual(formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0]))
|
self.assertEqual(formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0]))
|
||||||
|
|||||||
Reference in New Issue
Block a user