Docstrings
This commit is contained in:
@@ -300,6 +300,25 @@ ALBERT_INPUTS_DOCSTRING = r"""
|
|||||||
@add_start_docstrings("The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
|
@add_start_docstrings("The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.",
|
||||||
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
|
ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
|
||||||
class AlbertModel(BertModel):
|
class AlbertModel(BertModel):
|
||||||
|
r"""
|
||||||
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
|
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
|
||||||
|
Sequence of hidden-states at the output of the last layer of the model.
|
||||||
|
**pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
|
||||||
|
Last layer hidden-state of the first token of the sequence (classification token)
|
||||||
|
further processed by a Linear layer and a Tanh activation function. The Linear
|
||||||
|
layer weights are trained from the next sentence prediction (classification)
|
||||||
|
objective during BERT pretraining. This output is usually *not* a good summary
|
||||||
|
of the semantic content of the input; you're often better off averaging or pooling
|
||||||
|
the sequence of hidden-states for the whole input sequence.
|
||||||
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
|
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
|
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||||
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
|
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
"""
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(AlbertModel, self).__init__(config)
|
super(AlbertModel, self).__init__(config)
|
||||||
|
|
||||||
@@ -343,6 +362,27 @@ class AlbertModel(BertModel):
|
|||||||
|
|
||||||
@add_start_docstrings("Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
|
@add_start_docstrings("Bert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING, ALBERT_INPUTS_DOCSTRING)
|
||||||
class AlbertForMaskedLM(nn.Module):
|
class AlbertForMaskedLM(nn.Module):
|
||||||
|
r"""
|
||||||
|
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Labels for computing the masked language modeling loss.
|
||||||
|
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||||
|
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||||
|
in ``[0, ..., config.vocab_size]``
|
||||||
|
|
||||||
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
|
**loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||||
|
Masked language modeling loss.
|
||||||
|
**prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||||
|
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||||
|
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
|
||||||
|
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
|
||||||
|
of shape ``(batch_size, sequence_length, hidden_size)``:
|
||||||
|
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
|
||||||
|
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
|
||||||
|
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
|
||||||
|
Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
|
||||||
|
"""
|
||||||
|
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(AlbertForMaskedLM, self).__init__()
|
super(AlbertForMaskedLM, self).__init__()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user