[docstring] Update GPT2 and Whisper (#26642)
* [DOCS] Update docstrings for GPT2 and Whisper tokenizers * [DOCS] add pad_token argument to whisper tokenizer docstring * [FIX] Reword pad_token description * [CHORE] Apply style formatting --------- Co-authored-by: jmcdonnell <jmcdonnell@fieldbox.ai>
This commit is contained in:
@@ -58,7 +58,7 @@ class GPT2Config(PretrainedConfig):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
n_head (`int`, *optional*, defaults to 12):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
n_inner (`int`, *optional*, defaults to None):
|
||||
n_inner (`int`, *optional*):
|
||||
Dimensionality of the inner feed-forward layers. `None` will set it to 4 times `n_embd`.
|
||||
activation_function (`str`, *optional*, defaults to `"gelu_new"`):
|
||||
Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
|
||||
@@ -68,7 +68,7 @@ class GPT2Config(PretrainedConfig):
|
||||
The dropout ratio for the embeddings.
|
||||
attn_pdrop (`float`, *optional*, defaults to 0.1):
|
||||
The dropout ratio for the attention.
|
||||
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
|
||||
layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
|
||||
The epsilon to use in the layer normalization layers.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
@@ -107,6 +107,10 @@ class GPT2Config(PretrainedConfig):
|
||||
Scale attention weights by dividing by sqrt(hidden_size).
|
||||
use_cache (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should return the last key/values attentions (not used by all models).
|
||||
bos_token_id (`int`, *optional*, defaults to 50256):
|
||||
Id of the beginning of sentence token in the vocabulary.
|
||||
eos_token_id (`int`, *optional*, defaults to 50256):
|
||||
Id of the end of sentence token in the vocabulary.
|
||||
scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
|
||||
Whether to additionally scale attention weights by `1 / (layer_idx + 1)`.
|
||||
reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
|
||||
|
||||
@@ -136,16 +136,21 @@ class GPT2Tokenizer(PreTrainedTokenizer):
|
||||
errors (`str`, *optional*, defaults to `"replace"`):
|
||||
Paradigm to follow when decoding bytes to UTF-8. See
|
||||
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
|
||||
unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
|
||||
unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
|
||||
bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
|
||||
The beginning of sequence token.
|
||||
eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
|
||||
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
|
||||
The end of sequence token.
|
||||
pad_token (`str`, *optional*):
|
||||
The token used for padding, for example when batching sequences of different lengths.
|
||||
add_prefix_space (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
|
||||
other word. (The GPT2 tokenizer detects the beginning of words by the preceding space.)
|
||||
add_bos_token (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to add an initial beginning of sentence token to the input. This allows to treat the leading
|
||||
word just as any other word.
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
@@ -95,25 +95,23 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
|
||||
refer to this superclass for more information regarding those methods.
|
||||
|
||||
Args:
|
||||
vocab_file (`str`):
|
||||
vocab_file (`str`, *optional*):
|
||||
Path to the vocabulary file.
|
||||
merges_file (`str`):
|
||||
merges_file (`str`, *optional*):
|
||||
Path to the merges file.
|
||||
errors (`str`, *optional*, defaults to `"replace"`):
|
||||
Paradigm to follow when decoding bytes to UTF-8. See
|
||||
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
|
||||
unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
|
||||
tokenizer_file (`str`, *optional*):
|
||||
Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
|
||||
contains everything needed to load the tokenizer.
|
||||
unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
|
||||
bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
|
||||
The beginning of sequence token.
|
||||
eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
|
||||
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
|
||||
The end of sequence token.
|
||||
add_prefix_space (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
|
||||
other word. (The GPT2 tokenizer detects the beginning of words by the preceding space.)
|
||||
trim_offsets (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
@@ -224,7 +224,7 @@ class WhisperTokenizer(PreTrainedTokenizer):
|
||||
Path to the vocabulary file.
|
||||
merges_file (`str`):
|
||||
Path to the merges file.
|
||||
normalizer_file (`str`, *optional*, defaults to `None`):
|
||||
normalizer_file (`str`, *optional*):
|
||||
Path to the normalizer file.
|
||||
errors (`str`, *optional*, defaults to `"replace"`):
|
||||
Paradigm to follow when decoding bytes to UTF-8. See
|
||||
@@ -237,6 +237,8 @@ class WhisperTokenizer(PreTrainedTokenizer):
|
||||
`"<|startoftranscript|>"` when generating.
|
||||
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
|
||||
The end of sequence token.
|
||||
pad_token (`str`, *optional*):
|
||||
The token used for padding, for example when batching sequences of different lengths.
|
||||
add_prefix_space (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
|
||||
other word.
|
||||
|
||||
@@ -95,28 +95,26 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast):
|
||||
refer to this superclass for more information regarding those methods.
|
||||
|
||||
Args:
|
||||
vocab_file (`str`):
|
||||
vocab_file (`str`, *optional*):
|
||||
Path to the vocabulary file.
|
||||
merges_file (`str`):
|
||||
merges_file (`str`, *optional*):
|
||||
Path to the merges file.
|
||||
normalizer_file (`str`, *optional*, defaults to `None`):
|
||||
normalizer_file (`str`, *optional*):
|
||||
Path to the normalizer file.
|
||||
errors (`str`, *optional*, defaults to `"replace"`):
|
||||
Paradigm to follow when decoding bytes to UTF-8. See
|
||||
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
|
||||
unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
|
||||
tokenizer_file (`str`, *optional*):
|
||||
Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
|
||||
contains everything needed to load the tokenizer.
|
||||
unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
|
||||
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
||||
token instead.
|
||||
bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
|
||||
The beginning of sequence token. The `decoder_start_token_id` is used to set the first token as
|
||||
`"<|startoftranscript|>"` when generating.
|
||||
eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
|
||||
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
|
||||
The end of sequence token.
|
||||
add_prefix_space (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
|
||||
other word. (The Whisper tokenizer detects the beginning of words by the preceding space.)
|
||||
trim_offsets (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
|
||||
language (`str`, *optional*):
|
||||
The language of the transcription text. The corresponding language id token is appended to the start of the
|
||||
sequence for multilingual speech recognition and speech translation tasks, e.g. for Spanish the token
|
||||
|
||||
@@ -316,9 +316,6 @@ OBJECTS_TO_IGNORE = [
|
||||
"FlavaTextModel",
|
||||
"FocalNetModel",
|
||||
"FunnelTokenizerFast",
|
||||
"GPT2Config",
|
||||
"GPT2Tokenizer",
|
||||
"GPT2TokenizerFast",
|
||||
"GPTBigCodeConfig",
|
||||
"GPTJConfig",
|
||||
"GPTNeoXConfig",
|
||||
@@ -789,8 +786,6 @@ OBJECTS_TO_IGNORE = [
|
||||
"WhisperConfig",
|
||||
"WhisperFeatureExtractor",
|
||||
"WhisperForAudioClassification",
|
||||
"WhisperTokenizer",
|
||||
"WhisperTokenizerFast",
|
||||
"XCLIPTextConfig",
|
||||
"XCLIPVisionConfig",
|
||||
"XGLMConfig",
|
||||
|
||||
Reference in New Issue
Block a user