[docstring] Update GPT2 and Whisper (#26642)

* [DOCS] Update docstrings for GPT2 and Whisper tokenizers

* [DOCS] add pad_token argument to whisper tokenizer docstring

* [FIX] Reword pad_token description

* [CHORE] Apply style formatting

---------

Co-authored-by: jmcdonnell <jmcdonnell@fieldbox.ai>
This commit is contained in:
Joseph McDonnell
2023-10-12 17:00:59 +02:00
committed by GitHub
parent eb734e5147
commit b4199c2dad
6 changed files with 33 additions and 31 deletions

View File

@@ -58,7 +58,7 @@ class GPT2Config(PretrainedConfig):
Number of hidden layers in the Transformer encoder. Number of hidden layers in the Transformer encoder.
n_head (`int`, *optional*, defaults to 12): n_head (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder. Number of attention heads for each attention layer in the Transformer encoder.
n_inner (`int`, *optional*, defaults to None): n_inner (`int`, *optional*):
Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
activation_function (`str`, *optional*, defaults to `"gelu_new"`): activation_function (`str`, *optional*, defaults to `"gelu_new"`):
Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`. Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
@@ -68,7 +68,7 @@ class GPT2Config(PretrainedConfig):
The dropout ratio for the embeddings. The dropout ratio for the embeddings.
attn_pdrop (`float`, *optional*, defaults to 0.1): attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention. The dropout ratio for the attention.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
The epsilon to use in the layer normalization layers. The epsilon to use in the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02): initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices. The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
@@ -107,6 +107,10 @@ class GPT2Config(PretrainedConfig):
Scale attention weights by dividing by sqrt(hidden_size). Scale attention weights by dividing by sqrt(hidden_size).
use_cache (`bool`, *optional*, defaults to `True`): use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Whether or not the model should return the last key/values attentions (not used by all models).
bos_token_id (`int`, *optional*, defaults to 50256):
Id of the beginning of sentence token in the vocabulary.
eos_token_id (`int`, *optional*, defaults to 50256):
Id of the end of sentence token in the vocabulary.
scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`): scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
Whether to additionally scale attention weights by `1 / (layer_idx + 1)`. Whether to additionally scale attention weights by `1 / (layer_idx + 1)`.
reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`): reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):

View File

@@ -136,16 +136,21 @@ class GPT2Tokenizer(PreTrainedTokenizer):
errors (`str`, *optional*, defaults to `"replace"`): errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
unk_token (`str`, *optional*, defaults to `<|endoftext|>`): unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
bos_token (`str`, *optional*, defaults to `<|endoftext|>`): bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The beginning of sequence token. The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `<|endoftext|>`): eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token. The end of sequence token.
pad_token (`str`, *optional*):
The token used for padding, for example when batching sequences of different lengths.
add_prefix_space (`bool`, *optional*, defaults to `False`): add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (The GPT2 tokenizer detects the beginning of words by the preceding space). other word. (The GPT2 tokenizer detects the beginning of words by the preceding space).
add_bos_token (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial beginning of sentence token to the input. This allows to treat the leading
word just as any other word.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -95,25 +95,23 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
refer to this superclass for more information regarding those methods. refer to this superclass for more information regarding those methods.
Args: Args:
vocab_file (`str`): vocab_file (`str`, *optional*):
Path to the vocabulary file. Path to the vocabulary file.
merges_file (`str`): merges_file (`str`, *optional*):
Path to the merges file. Path to the merges file.
errors (`str`, *optional*, defaults to `"replace"`): tokenizer_file (`str`, *optional*):
Paradigm to follow when decoding bytes to UTF-8. See Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. contains everything needed to load the tokenizer.
unk_token (`str`, *optional*, defaults to `<|endoftext|>`): unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
bos_token (`str`, *optional*, defaults to `<|endoftext|>`): bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The beginning of sequence token. The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `<|endoftext|>`): eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token. The end of sequence token.
add_prefix_space (`bool`, *optional*, defaults to `False`): add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (The GPT2 tokenizer detects the beginning of words by the preceding space). other word. (The GPT2 tokenizer detects the beginning of words by the preceding space).
trim_offsets (`bool`, *optional*, defaults to `True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -224,7 +224,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
Path to the vocabulary file. Path to the vocabulary file.
merges_file (`str`): merges_file (`str`):
Path to the merges file. Path to the merges file.
normalizer_file (`str`, *optional*, defaults to `None`): normalizer_file (`str`, *optional*):
Path to the normalizer file. Path to the normalizer file.
errors (`str`, *optional*, defaults to `"replace"`): errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See Paradigm to follow when decoding bytes to UTF-8. See
@@ -237,6 +237,8 @@ class WhisperTokenizer(PreTrainedTokenizer):
`"<|startoftranscript|>"` when generating. `"<|startoftranscript|>"` when generating.
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token. The end of sequence token.
pad_token (`str`, *optional*):
The token used for padding, for example when batching sequences of different lengths.
add_prefix_space (`bool`, *optional*, defaults to `False`): add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. other word.

View File

@@ -95,28 +95,26 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
refer to this superclass for more information regarding those methods. refer to this superclass for more information regarding those methods.
Args: Args:
vocab_file (`str`): vocab_file (`str`, *optional*):
Path to the vocabulary file. Path to the vocabulary file.
merges_file (`str`): merges_file (`str`, *optional*):
Path to the merges file. Path to the merges file.
normalizer_file (`str`, *optional*, defaults to `None`): normalizer_file (`str`, *optional*):
Path to the normalizer_file file. Path to the normalizer_file file.
errors (`str`, *optional*, defaults to `"replace"`): tokenizer_file (`str`, *optional*):
Paradigm to follow when decoding bytes to UTF-8. See Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. contains everything needed to load the tokenizer.
unk_token (`str`, *optional*, defaults to `<|endoftext|>`): unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead. token instead.
bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`): bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The beginning of sequence token. The `decoder_start_token_id` is used to set the first token as The beginning of sequence token. The `decoder_start_token_id` is used to set the first token as
`"<|startoftranscript|>"` when generating. `"<|startoftranscript|>"` when generating.
eos_token (`str`, *optional*, defaults to `<|endoftext|>`): eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
The end of sequence token. The end of sequence token.
add_prefix_space (`bool`, *optional*, defaults to `False`): add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (The Whisper tokenizer detects the beginning of words by the preceding space). other word. (The Whisper tokenizer detects the beginning of words by the preceding space).
trim_offsets (`bool`, *optional*, defaults to `True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
language (`str`, *optional*): language (`str`, *optional*):
The language of the transcription text. The corresponding language id token is appended to the start of the The language of the transcription text. The corresponding language id token is appended to the start of the
sequence for multilingual speech recognition and speech translation tasks, e.g. for Spanish the token sequence for multilingual speech recognition and speech translation tasks, e.g. for Spanish the token

View File

@@ -316,9 +316,6 @@ OBJECTS_TO_IGNORE = [
"FlavaTextModel", "FlavaTextModel",
"FocalNetModel", "FocalNetModel",
"FunnelTokenizerFast", "FunnelTokenizerFast",
"GPT2Config",
"GPT2Tokenizer",
"GPT2TokenizerFast",
"GPTBigCodeConfig", "GPTBigCodeConfig",
"GPTJConfig", "GPTJConfig",
"GPTNeoXConfig", "GPTNeoXConfig",
@@ -789,8 +786,6 @@ OBJECTS_TO_IGNORE = [
"WhisperConfig", "WhisperConfig",
"WhisperFeatureExtractor", "WhisperFeatureExtractor",
"WhisperForAudioClassification", "WhisperForAudioClassification",
"WhisperTokenizer",
"WhisperTokenizerFast",
"XCLIPTextConfig", "XCLIPTextConfig",
"XCLIPVisionConfig", "XCLIPVisionConfig",
"XGLMConfig", "XGLMConfig",