diff --git a/docs/source/en/model_doc/code_llama.md b/docs/source/en/model_doc/code_llama.md index 6408f49d1f..a60cf16415 100644 --- a/docs/source/en/model_doc/code_llama.md +++ b/docs/source/en/model_doc/code_llama.md @@ -49,6 +49,8 @@ Here is a sample usage python src/transformers/models/llama/convert_llama_weights_to_hf.py \ --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path ``` +Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions +come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). - After conversion, the model and tokenizer can be loaded via: @@ -90,8 +92,8 @@ If you only want the infilled part: >>> generator = pipeline("text-generation",model="codellama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto") >>> generator('def remove_non_ascii(s: str) -> str:\n """ \n return result', max_new_tokens = 128, return_type = 1) ``` -Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions -come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). For the 75B model, it's thus 145GB of RAM needed. + +Under the hood, the tokenizer [automatically splits by ``](https://huggingface.co/docs/transformers/main/model_doc/code_llama#transformers.CodeLlamaTokenizer.fill_token) to create a formatted input string that follows [the original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself: it avoids pitfalls, such as token glueing, that are very hard to debug. 
To see how much CPU and GPU memory you need for this model or others, try [this calculator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage) which can help determine that value. - The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string. diff --git a/src/transformers/models/code_llama/tokenization_code_llama.py b/src/transformers/models/code_llama/tokenization_code_llama.py index 3f1c326bba..0cf48b1207 100644 --- a/src/transformers/models/code_llama/tokenization_code_llama.py +++ b/src/transformers/models/code_llama/tokenization_code_llama.py @@ -64,6 +64,10 @@ class CodeLlamaTokenizer(PreTrainedTokenizer): Construct a CodeLlama tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is no padding token in the original model. + The default configuration match that of + [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/tokenizer_config.json) + which supports prompt infilling. + Args: vocab_file (`str`): Path to the vocabulary file. @@ -80,8 +84,6 @@ class CodeLlamaTokenizer(PreTrainedTokenizer): unk_token (`str`, *optional*, defaults to `""`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. - pad_token (`str`, *optional*, defaults to `""`): - The token used for padding, for example when batching sequences of different lengths. prefix_token (`str`, *optional*, defaults to `"▁
"`):
             Prefix token used for infilling.
         suffix_token (`str`, *optional*, defaults to `"▁"`):
@@ -111,7 +113,8 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
 
             - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
-
+        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
+            Whether or not the default system prompt for Llama should be used.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -125,7 +128,6 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         unk_token="",
         bos_token="",
         eos_token="",
-        pad_token=None,
         prefix_token="▁
",
         middle_token="▁",
         suffix_token="▁",
@@ -136,6 +138,8 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         add_bos_token=True,
         add_eos_token=False,
         clean_up_tokenization_spaces=False,
+        additional_special_tokens=None,
+        use_default_system_prompt=False,
         **kwargs,
     ):
         requires_backends(self, "protobuf")
@@ -143,16 +147,17 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
         eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
-        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
 
+        self.use_default_system_prompt = use_default_system_prompt
         # mark tokens special to skip them
-        additional_special_tokens = kwargs.pop("additional_special_tokens", [])
-        additional_special_tokens += [prefix_token, middle_token, suffix_token, eot_token]
+        additional_special_tokens = additional_special_tokens or []
+        for token in [prefix_token, middle_token, suffix_token, eot_token]:
+            additional_special_tokens += [token] if token is not None else []
+
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
             unk_token=unk_token,
-            pad_token=pad_token,
             add_bos_token=add_bos_token,
             add_eos_token=add_eos_token,
             prefix_token=prefix_token,
@@ -164,6 +169,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
             suffix_first=suffix_first,
             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
             additional_special_tokens=additional_special_tokens,
+            use_default_system_prompt=use_default_system_prompt,
             **kwargs,
         )
         self.vocab_file = vocab_file
@@ -239,6 +245,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         """Returns vocab size"""
         return self.sp_model.get_piece_size()
 
+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_vocab
     def get_vocab(self):
         """Returns vocab as a dict"""
         vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
@@ -247,7 +254,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
 
     def tokenize(self, prefix, suffix=None, suffix_first=False, **kwargs) -> List[int]:
         # add a prefix space to `prefix`
-        if self.fill_token in prefix and suffix is None:
+        if self.fill_token is not None and self.fill_token in prefix and suffix is None:
             prefix, suffix = prefix.split(self.fill_token)
 
         if len(prefix) > 0:
@@ -263,9 +270,9 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
 
         if None in (self.prefix_id, self.middle_id, self.suffix_id):
             raise ValueError(
-                "Then input includes a `prefix` and a `suffix` used for the infilling task,"
-                " the `prefix_id, middle_id, suffix_id` must all be initialized. Current"
-                f" values : {self.prefix_id, self.middle_id, self.suffix_id}"
+                "The input either includes a `prefix` and a `suffix` used for the infilling task,"
+                f"  or can be split on the {self.fill_token} token, creating a suffix and prefix,"
+                " but the model does not support `infilling`."
             )
         suffix_tokens = self._tokenize(suffix)  # make sure CodeLlama sp model does not mess up
 
@@ -293,10 +300,12 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
         return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
 
+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_token_to_id
     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
         return self.sp_model.piece_to_id(token)
 
+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_id_to_token
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (str) using the vocab."""
         token = self.sp_model.IdToPiece(index)
@@ -320,6 +329,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         out_string += self.sp_model.decode(current_sub_tokens)
         return out_string
 
+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.save_vocabulary
     def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
         """
         Save the vocabulary and special tokens file to a directory.
@@ -347,6 +357,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
 
         return (out_vocab_file,)
 
+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.build_inputs_with_special_tokens
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
@@ -358,6 +369,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
 
         return output
 
+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_special_tokens_mask
     def get_special_tokens_mask(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
     ) -> List[int]:
@@ -395,6 +407,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
             + eos_token_id
         )
 
+    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.create_token_type_ids_from_sequences
     def create_token_type_ids_from_sequences(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
@@ -443,7 +456,7 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
         >>> from transformers import Conversation
 
         >>> Conversation(
-        ...     "<>\n Only answer with emojis, and charades\n<>\n\nHow can I build a house in 10 septs?"
+        ...     "<>\n Complete the functions without any documentation\n<>\n\n `def remove_non_ascii(s: str) -> str:`"
         ... )  # doctest: +IGNORE_RESULT
         ```
         Args:
@@ -453,16 +466,20 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
             `List[int]`:
                 Input ids for the conversation.
         """
-        if len(conversation.past_user_inputs) > 0:
-            if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
-                conversation.past_user_inputs[0] = (
-                    B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
-                )
-        elif conversation.new_user_input:
-            if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
-                conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
-        else:
-            raise ValueError("Last message must be from user")
+        if self.use_default_system_prompt:
+            if len(conversation.past_user_inputs) > 0:
+                if (
+                    not conversation.past_user_inputs[0].startswith(B_SYS)
+                    or E_SYS not in conversation.past_user_inputs[0]
+                ):
+                    conversation.past_user_inputs[0] = (
+                        B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
+                    )
+            elif conversation.new_user_input:
+                if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
+                    conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
+            else:
+                raise ValueError("Last message must be from user")
 
         dialogue = list(conversation.iter_texts())
         if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
diff --git a/src/transformers/models/code_llama/tokenization_code_llama_fast.py b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
index b815a6d3e9..d3fc6e8abb 100644
--- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py
+++ b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
@@ -73,7 +73,9 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
 
 
     This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-    refer to this superclass for more information regarding those methods.
+    refer to this superclass for more information regarding those methods. The default configuration matches that of
+    [codellama/CodeLlama-7b-Instruct-hf](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf/blob/main/tokenizer_config.json)
+    which supports prompt infilling.
 
     Args:
         vocab_file (`str`):
@@ -104,6 +106,10 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
             The token used to split the input between the prefix and suffix.
         suffix_first (`bool`, *optional*, default to `False`):
             Whether the input prompt and suffix should be formatted with the suffix first.
+        additional_special_tokens (`List[str]`, *optional*):
+            Additional special tokens used by the tokenizer.
+        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
+            Whether or not the default system prompt for Llama should be used.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -124,13 +130,18 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
         suffix_token="▁",
         eot_token="▁",
         fill_token="",
+        additional_special_tokens=None,
         add_bos_token=True,
         add_eos_token=False,
+        use_default_system_prompt=False,
         **kwargs,
     ):
         # mark tokens special to skip them
-        additional_special_tokens = kwargs.pop("additional_special_tokens", [])
-        additional_special_tokens += [prefix_token, middle_token, suffix_token, eot_token]
+        additional_special_tokens = additional_special_tokens or []
+        for token in [prefix_token, middle_token, suffix_token, eot_token]:
+            additional_special_tokens += [token] if token is not None else []
+        self.use_default_system_prompt = use_default_system_prompt
+
         super().__init__(
             vocab_file=vocab_file,
             tokenizer_file=tokenizer_file,
@@ -144,6 +155,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
             suffix_token=suffix_token,
             eot_token=eot_token,
             fill_token=fill_token,
+            use_default_system_prompt=use_default_system_prompt,
             **kwargs,
         )
         self._add_bos_token = add_bos_token
@@ -162,6 +174,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
     def can_save_slow_tokenizer(self) -> bool:
         return os.path.isfile(self.vocab_file) if self.vocab_file else False
 
+    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.update_post_processor
     def update_post_processor(self):
         """
         Updates the underlying post processor with the current `bos_token` and `eos_token`.
@@ -300,6 +313,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
         self.set_infilling_processor(True)
         return tokens
 
+    # Copied from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast.save_vocabulary
     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not self.can_save_slow_tokenizer:
             raise ValueError(
@@ -343,12 +357,12 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
         Returns:
             `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
-        # TODO process the ids for fast? Or update the template processing for infilling task when using `tokenize_infilling`
         if token_ids_1 is None:
-            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
-        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
+            return self.bos_token_id + token_ids_0 + self.eos_token_id
+        return self.bos_token_id + token_ids_0 + token_ids_1 + self.eos_token_id
 
-    def _build_conversation_input_ids(self, conversation: "Conversation"):
+    # Copied from transformers.models.code_llama.tokenization_code_llama.CodeLlamaTokenizer._build_conversation_input_ids
+    def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
         r"""Builds the input ids for a conversation.
         This is the format used in the provided examples. System prompts should be manually added at the beginning of
         the conversation. If no system prompt is given, the `DEFAULT_SYSTEM_PROMPT` will be used.
@@ -363,7 +377,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
         >>> from transformers import Conversation
 
         >>> Conversation(
-        ...     "<>\n Only answer with emojis, and charades\n<>\n\nHow can I build a house in 10 septs?"
+        ...     "<>\n Complete the functions without any documentation\n<>\n\n `def remove_non_ascii(s: str) -> str:`"
         ... )  # doctest: +IGNORE_RESULT
         ```
         Args:
@@ -373,16 +387,20 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
             `List[int]`:
                 Input ids for the conversation.
         """
-        if len(conversation.past_user_inputs) > 0:
-            if not conversation.past_user_inputs[0].startswith(B_SYS) or E_SYS not in conversation.past_user_inputs[0]:
-                conversation.past_user_inputs[0] = (
-                    B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
-                )
-        elif conversation.new_user_input:
-            if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
-                conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
-        else:
-            raise ValueError("Last message must be from user")
+        if self.use_default_system_prompt:
+            if len(conversation.past_user_inputs) > 0:
+                if (
+                    not conversation.past_user_inputs[0].startswith(B_SYS)
+                    or E_SYS not in conversation.past_user_inputs[0]
+                ):
+                    conversation.past_user_inputs[0] = (
+                        B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.past_user_inputs[0]
+                    )
+            elif conversation.new_user_input:
+                if not conversation.new_user_input.startswith(B_SYS) or E_SYS not in conversation.new_user_input:
+                    conversation.new_user_input = B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + conversation.new_user_input
+            else:
+                raise ValueError("Last message must be from user")
 
         dialogue = list(conversation.iter_texts())
         if not all([is_user for is_user, msg in dialogue[::2]]) or not all(
@@ -392,7 +410,7 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
                 "The model only supports 'user' and 'assistant' roles, starting with user and alternating (u/a/u/a/u...)"
             )
 
-        dialog_tokens = []
+        dialog_tokens: List[int] = []
         dialog_tokens += sum(
             [
                 [self.bos_token_id]
diff --git a/tests/models/code_llama/test_tokenization_code_llama.py b/tests/models/code_llama/test_tokenization_code_llama.py
index 5dc251741b..b4e204625a 100644
--- a/tests/models/code_llama/test_tokenization_code_llama.py
+++ b/tests/models/code_llama/test_tokenization_code_llama.py
@@ -65,6 +65,11 @@ class CodeLlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         tokenizer.pad_token = tokenizer.eos_token
         tokenizer.save_pretrained(self.tmpdirname)
 
+    def test_no_infilling_init(self):
+        tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, prefix_token=None, keep_accents=True)
+        with self.assertRaises(ValueError):
+            tokenizer.tokenize("This is  prefix")
+
     def test_full_tokenizer(self):
         tokenizer = CodeLlamaTokenizer(SAMPLE_VOCAB, keep_accents=True)
 
@@ -587,8 +592,8 @@ split,
 end
 """,
         ]
-        tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
-        tokenizer_fast = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-hf")
+        tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
+        tokenizer_fast = CodeLlamaTokenizerFast.from_pretrained("codellama/CodeLlama-7b-Instruct-hf")
 
         formatted_prompt = tokenizer.tokenize(PROMPTS[0])
         self.assertEqual(formatted_prompt, tokenizer_fast.tokenize(PROMPTS[0]))