From b4199c2dad51639f5c467c7e2986e5270b275d84 Mon Sep 17 00:00:00 2001 From: Joseph McDonnell <90898184+McDonnellJoseph@users.noreply.github.com> Date: Thu, 12 Oct 2023 17:00:59 +0200 Subject: [PATCH] [docstring] Update `GPT2` and `Whisper` (#26642) * [DOCS] Update docstrings for `GPT2` and `Whisper` tokenizer * [DOCS] add pad_token argument to whisper tokenizer docstring * [FIX] Reword pad_token description * [CHORE] Apply style formatting --------- Co-authored-by: jmcdonnell --- .../models/gpt2/configuration_gpt2.py | 8 ++++++-- .../models/gpt2/tokenization_gpt2.py | 11 ++++++++--- .../models/gpt2/tokenization_gpt2_fast.py | 18 ++++++++---------- .../models/whisper/tokenization_whisper.py | 4 +++- .../whisper/tokenization_whisper_fast.py | 18 ++++++++---------- utils/check_docstrings.py | 5 ----- 6 files changed, 33 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py index ef1c591a27..d35a161428 100644 --- a/src/transformers/models/gpt2/configuration_gpt2.py +++ b/src/transformers/models/gpt2/configuration_gpt2.py @@ -58,7 +58,7 @@ class GPT2Config(PretrainedConfig): Number of hidden layers in the Transformer encoder. n_head (`int`, *optional*, defaults to 12): Number of attention heads for each attention layer in the Transformer encoder. - n_inner (`int`, *optional*, defaults to None): + n_inner (`int`, *optional*): Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd activation_function (`str`, *optional*, defaults to `"gelu_new"`): Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`. @@ -68,7 +68,7 @@ class GPT2Config(PretrainedConfig): The dropout ratio for the embeddings. attn_pdrop (`float`, *optional*, defaults to 0.1): The dropout ratio for the attention. 
- layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + layer_norm_epsilon (`float`, *optional*, defaults to 1e-05): The epsilon to use in the layer normalization layers. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -107,6 +107,10 @@ class GPT2Config(PretrainedConfig): Scale attention weights by dividing by sqrt(hidden_size).. use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). + bos_token_id (`int`, *optional*, defaults to 50256): + Id of the beginning of sentence token in the vocabulary. + eos_token_id (`int`, *optional*, defaults to 50256): + Id of the end of sentence token in the vocabulary. scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): Whether to additionally scale attention weights by `1 / layer_idx + 1`. reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py index 21c2cdf382..e757fd9801 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2.py +++ b/src/transformers/models/gpt2/tokenization_gpt2.py @@ -136,16 +136,21 @@ class GPT2Tokenizer(PreTrainedTokenizer): errors (`str`, *optional*, defaults to `"replace"`): Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. - unk_token (`str`, *optional*, defaults to `<|endoftext|>`): + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. - bos_token (`str`, *optional*, defaults to `<|endoftext|>`): + bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): The beginning of sequence token. 
- eos_token (`str`, *optional*, defaults to `<|endoftext|>`): + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): The end of sequence token. + pad_token (`str`, *optional*): + The token used for padding, for example when batching sequences of different lengths. add_prefix_space (`bool`, *optional*, defaults to `False`): Whether or not to add an initial space to the input. This allows to treat the leading word just as any other word. (GPT2 tokenizer detect beginning of words by the preceding space). + add_bos_token (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial beginning of sentence token to the input. This allows to treat the leading + word just as any other word. """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py index 189a355084..0f7a31c9f8 100644 --- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py +++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py @@ -95,25 +95,23 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast): refer to this superclass for more information regarding those methods. Args: - vocab_file (`str`): + vocab_file (`str`, *optional*): Path to the vocabulary file. - merges_file (`str`): + merges_file (`str`, *optional*): Path to the merges file. - errors (`str`, *optional*, defaults to `"replace"`): - Paradigm to follow when decoding bytes to UTF-8. See - [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. - unk_token (`str`, *optional*, defaults to `<|endoftext|>`): + tokenizer_file (`str`, *optional*): + Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that + contains everything needed to load the tokenizer. + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): The unknown token. 
A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. - bos_token (`str`, *optional*, defaults to `<|endoftext|>`): + bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): The beginning of sequence token. - eos_token (`str`, *optional*, defaults to `<|endoftext|>`): + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): The end of sequence token. add_prefix_space (`bool`, *optional*, defaults to `False`): Whether or not to add an initial space to the input. This allows to treat the leading word just as any other word. (GPT2 tokenizer detect beginning of words by the preceding space). - trim_offsets (`bool`, *optional*, defaults to `True`): - Whether or not the post-processing step should trim offsets to avoid including whitespaces. """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py index b83528a84a..d2f6ea382f 100644 --- a/src/transformers/models/whisper/tokenization_whisper.py +++ b/src/transformers/models/whisper/tokenization_whisper.py @@ -224,7 +224,7 @@ class WhisperTokenizer(PreTrainedTokenizer): Path to the vocabulary file. merges_file (`str`): Path to the merges file. - normalizer_file (`str`, *optional*, defaults to `None`): + normalizer_file (`str`, *optional*): Path to the normalizer_file file. errors (`str`, *optional*, defaults to `"replace"`): Paradigm to follow when decoding bytes to UTF-8. See @@ -237,6 +237,8 @@ class WhisperTokenizer(PreTrainedTokenizer): `"<|startoftranscript|>"` when generating. eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): The end of sequence token. + pad_token (`str`, *optional*): + The token used for padding, for example when batching sequences of different lengths. add_prefix_space (`bool`, *optional*, defaults to `False`): Whether or not to add an initial space to the input. 
This allows to treat the leading word just as any other word. diff --git a/src/transformers/models/whisper/tokenization_whisper_fast.py b/src/transformers/models/whisper/tokenization_whisper_fast.py index 64a4343a19..43d8a46d7c 100644 --- a/src/transformers/models/whisper/tokenization_whisper_fast.py +++ b/src/transformers/models/whisper/tokenization_whisper_fast.py @@ -95,28 +95,26 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast): refer to this superclass for more information regarding those methods. Args: - vocab_file (`str`): + vocab_file (`str`, *optional*): Path to the vocabulary file. - merges_file (`str`): + merges_file (`str`, *optional*): Path to the merges file. - normalizer_file (`str`, *optional*, defaults to `None`): + normalizer_file (`str`, *optional*): Path to the normalizer_file file. - errors (`str`, *optional*, defaults to `"replace"`): - Paradigm to follow when decoding bytes to UTF-8. See - [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. - unk_token (`str`, *optional*, defaults to `<|endoftext|>`): + tokenizer_file (`str`, *optional*): + Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that + contains everything needed to load the tokenizer. + unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`): The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this token instead. bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): The beginning of sequence token. The `decoder_start_token_id` is used to set the first token as `"<|startoftranscript|>"` when generating. - eos_token (`str`, *optional*, defaults to `<|endoftext|>`): + eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): The end of sequence token. add_prefix_space (`bool`, *optional*, defaults to `False`): Whether or not to add an initial space to the input. 
This allows to treat the leading word just as any other word. (Whisper tokenizer detect beginning of words by the preceding space). - trim_offsets (`bool`, *optional*, defaults to `True`): - Whether or not the post-processing step should trim offsets to avoid including whitespaces. language (`str`, *optional*): The language of the transcription text. The corresponding language id token is appended to the start of the sequence for multilingual speech recognition and speech translation tasks, e.g. for Spanish the token diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 6ad68d3c67..6ad8ea1e20 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -316,9 +316,6 @@ OBJECTS_TO_IGNORE = [ "FlavaTextModel", "FocalNetModel", "FunnelTokenizerFast", - "GPT2Config", - "GPT2Tokenizer", - "GPT2TokenizerFast", "GPTBigCodeConfig", "GPTJConfig", "GPTNeoXConfig", @@ -789,8 +786,6 @@ OBJECTS_TO_IGNORE = [ "WhisperConfig", "WhisperFeatureExtractor", "WhisperForAudioClassification", - "WhisperTokenizer", - "WhisperTokenizerFast", "XCLIPTextConfig", "XCLIPVisionConfig", "XGLMConfig",