From 5c081e29930466ecf9a478727039d980131076d9 Mon Sep 17 00:00:00 2001
From: Bojun-Feng <102875484+Bojun-Feng@users.noreply.github.com>
Date: Mon, 16 Oct 2023 03:11:45 -0500
Subject: [PATCH] [docstring] Fix docstring for `CodeLlamaTokenizerFast` (#26666)

* remove from OBJECTS_TO_IGNORE

* run check_docstrings.py

* fill in information

* ignore CodeLlamaTokenizer
---
 .../tokenization_code_llama_fast.py           | 22 ++++++++++---------
 utils/check_docstrings.py                     |  1 -
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/src/transformers/models/code_llama/tokenization_code_llama_fast.py b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
index 5e8a7945dc..719caa48f6 100644
--- a/src/transformers/models/code_llama/tokenization_code_llama_fast.py
+++ b/src/transformers/models/code_llama/tokenization_code_llama_fast.py
@@ -75,37 +75,39 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
     which supports prompt infilling.
 
     Args:
-        vocab_file (`str`):
+        vocab_file (`str`, *optional*):
             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .model extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        tokenizer_file (`str`):
+        tokenizer_file (`str`, *optional*):
             [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
             contains everything needed to load the tokenizer.
         clean_up_tokenization_spaces (`str`, *optional*, defaults to `False`):
             Wether to cleanup spaces after decoding, cleanup consists in removing potential artifacts like extra
             spaces.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
         bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
         eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
-        unk_token (`str`, *optional*, defaults to `"<unk>"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
         prefix_token (`str`, *optional*, defaults to `"▁<PRE>"`):
             Prefix token used for infilling.
-        suffix_token (`str`, *optional*, defaults to `"▁<SUF>"`):
-            Suffix token used for infilling.
         middle_token (`str`, *optional*, defaults to `"▁<MID>"`):
             Middle token used for infilling.
+        suffix_token (`str`, *optional*, defaults to `"▁<SUF>"`):
+            Suffix token used for infilling.
         eot_token (`str`, *optional*, defaults to `"▁<EOT>"`):
             End of text token used for infilling.
         fill_token (`str`, *optional*, defaults to `"<FILL_ME>"`):
             The token used to split the input between the prefix and suffix.
-        suffix_first (`bool`, *optional*, default to `False`):
-            Whether the input prompt and suffix should be formatted with the suffix first.
         additional_special_tokens (`List[str]`, *optional*):
             Additional special tokens used by the tokenizer.
-        use_default_system_prompt (`bool`, *optional*, defaults to `True`):
+        add_bos_token (`bool`, *optional*, defaults to `True`):
+            Whether to add a beginning of sequence token at the start of sequences.
+        add_eos_token (`bool`, *optional*, defaults to `False`):
+            Whether to add an end of sequence token at the end of sequences.
+        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
             Whether or not the default system prompt for Llama should be used.
     """
 
diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py
index 25861e5a9c..8a9aa1cf76 100644
--- a/utils/check_docstrings.py
+++ b/utils/check_docstrings.py
@@ -130,7 +130,6 @@ OBJECTS_TO_IGNORE = [
     "CodeGenConfig",
     "CodeGenTokenizer",
     "CodeGenTokenizerFast",
-    "CodeLlamaTokenizerFast",
     "ConditionalDetrConfig",
     "ConditionalDetrImageProcessor",
     "ConvBertConfig",