From 5c6b83cb69da2fbc58b3c25c0ed7ba354a3d4141 Mon Sep 17 00:00:00 2001
From: przemL <24912415+przemL@users.noreply.github.com>
Date: Mon, 16 Oct 2023 18:26:55 +0200
Subject: [PATCH] [docstring] Fix bert generation tokenizer (#26820)

* Remove BertGenerationTokenizer from objects to ignore

The BertGenerationTokenizer class is removed from OBJECTS_TO_IGNORE
in utils/check_docstrings.py as a first step toward fixing its docstring.

* Docstrings fix for BertGenerationTokenizer

The docstring fix for BertGenerationTokenizer is generated
by running utils/check_docstrings.py.

* Fix docstring for BertGenerationTokenizer

Add the sep_token type and description to the BertGenerationTokenizer docstring.
---
 .../bert_generation/tokenization_bert_generation.py       | 8 ++++++--
 utils/check_docstrings.py                                 | 1 -
 2 files changed, 6 insertions(+), 3 deletions(-)
diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py
index f8d49f86ac..3b6298fcbd 100644
--- a/src/transformers/models/bert_generation/tokenization_bert_generation.py
+++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py
@@ -51,15 +51,19 @@ class BertGenerationTokenizer(PreTrainedTokenizer):
         vocab_file (`str`):
             [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        eos_token (`str`, *optional*, defaults to `"</s>"`):
-            The end of sequence token.
         bos_token (`str`, *optional*, defaults to `"<s>"`):
             The begin of sequence token.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
         unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
         pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
+        sep_token (`str`, *optional*, defaults to `"<::::>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
         sp_model_kwargs (`dict`, *optional*):
             Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
             SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py
index 8a9aa1cf76..f46ad8995c 100644
--- a/utils/check_docstrings.py
+++ b/utils/check_docstrings.py
@@ -94,7 +94,6 @@ OBJECTS_TO_IGNORE = [
     "BarthezTokenizerFast",
     "BeitModel",
     "BertConfig",
-    "BertGenerationTokenizer",
     "BertJapaneseTokenizer",
     "BertModel",
     "BertTokenizerFast",