From 1262495a912b9cd97e2ae174fd627a9d8a502341 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 22 Jun 2020 14:43:52 -0400 Subject: [PATCH] Add TF auto model to the docs + fix sphinx warnings (#5187) --- docs/source/model_doc/auto.rst | 50 +++++++++- docs/source/model_doc/mobilebert.rst | 2 +- src/transformers/modeling_albert.py | 42 +++----- src/transformers/modeling_bart.py | 44 ++++----- src/transformers/modeling_bert.py | 36 +++---- src/transformers/modeling_ctrl.py | 10 +- src/transformers/modeling_distilbert.py | 14 +-- src/transformers/modeling_electra.py | 19 ++-- src/transformers/modeling_flaubert.py | 2 +- src/transformers/modeling_gpt2.py | 4 +- src/transformers/modeling_longformer.py | 14 +-- src/transformers/modeling_mobilebert.py | 34 ++++--- src/transformers/modeling_openai.py | 4 +- src/transformers/modeling_reformer.py | 4 +- src/transformers/modeling_roberta.py | 10 +- src/transformers/modeling_t5.py | 9 +- src/transformers/modeling_tf_albert.py | 107 +++++++++++---------- src/transformers/modeling_tf_auto.py | 16 +++ src/transformers/modeling_tf_bert.py | 66 ++++++------- src/transformers/modeling_tf_ctrl.py | 18 ++-- src/transformers/modeling_tf_distilbert.py | 48 ++++----- src/transformers/modeling_tf_electra.py | 40 ++++---- src/transformers/modeling_tf_gpt2.py | 27 +++--- src/transformers/modeling_tf_mobilebert.py | 58 ++++++----- src/transformers/modeling_tf_openai.py | 26 ++--- src/transformers/modeling_tf_roberta.py | 48 ++++----- src/transformers/modeling_tf_t5.py | 19 ++-- src/transformers/modeling_tf_transfo_xl.py | 16 +-- src/transformers/modeling_tf_xlm.py | 40 ++++---- src/transformers/modeling_tf_xlnet.py | 40 ++++---- src/transformers/modeling_transfo_xl.py | 4 +- src/transformers/modeling_xlm.py | 12 +-- src/transformers/modeling_xlnet.py | 14 +-- 33 files changed, 474 insertions(+), 423 deletions(-) diff --git a/docs/source/model_doc/auto.rst b/docs/source/model_doc/auto.rst index 55140d5a99..c3345ac4f4 100644 --- a/docs/source/model_doc/auto.rst +++ b/docs/source/model_doc/auto.rst @@ -1,11 +1,15 @@ AutoModels ----------- -In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method. +In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you +are supplying to the ``from_pretrained`` method. -AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary: +AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path +to the pretrained weights/config/vocabulary: -Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create a instance of ``BertModel``). +Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant +architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create a instance of +:class:`~transformers.BertModel`). ``AutoConfig`` @@ -63,3 +67,43 @@ Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will di .. autoclass:: transformers.AutoModelForTokenClassification :members: +``TFAutoModel`` +~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFAutoModel + :members: + + +``TFAutoModelForPreTraining`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFAutoModelForPreTraining + :members: + + +``TFAutoModelWithLMHead`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFAutoModelWithLMHead + :members: + + +``TFAutoModelForSequenceClassification`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFAutoModelForSequenceClassification + :members: + + +``TFAutoModelForQuestionAnswering`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFAutoModelForQuestionAnswering + :members: + + +``TFAutoModelForTokenClassification`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFAutoModelForTokenClassification + :members: diff --git a/docs/source/model_doc/mobilebert.rst b/docs/source/model_doc/mobilebert.rst index b8f0bba144..caec4af1fb 100644 --- a/docs/source/model_doc/mobilebert.rst +++ b/docs/source/model_doc/mobilebert.rst @@ -50,7 +50,7 @@ MobileBertTokenizer MobileBertTokenizerFast -~~~~~~~~~~~~~~~~~~~~~ +~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.MobileBertTokenizerFast :members: diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py index 4598b73ca8..fb86c3d62b 100644 --- a/src/transformers/modeling_albert.py +++ b/src/transformers/modeling_albert.py @@ -510,14 +510,12 @@ class AlbertModel(AlbertPreTrainedModel): This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned - when ``output_hidden_states=True`` is passed or when - ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -644,14 +642,12 @@ class AlbertForPreTraining(AlbertPreTrainedModel): sop_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned - when ``output_hidden_states=True`` is passed or when - ``config.output_hidden_states``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -795,14 +791,12 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned - when ``output_hidden_states=True`` is passed or when - ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -894,14 +888,12 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel): Classification (or regression if config.num_labels==1) loss. logits ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned - when ``output_hidden_states=True`` is passed or when - ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -993,14 +985,12 @@ class AlbertForTokenClassification(AlbertPreTrainedModel): Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned - when ``output_hidden_states=True`` is passed or when - ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1103,14 +1093,12 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel): Span-start scores (before SoftMax). end_scores: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned - when ``output_hidden_states=True`` is passed or when - ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1214,14 +1202,12 @@ class AlbertForMultipleChoice(AlbertPreTrainedModel): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned - when ``output_hidden_states=True`` is passed or when - ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py index b70471fe7f..9e1812be8e 100644 --- a/src/transformers/modeling_bart.py +++ b/src/transformers/modeling_bart.py @@ -933,14 +933,12 @@ class BartForConditionalGeneration(PretrainedBartModel): Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned - when ``output_hidden_states=True`` is passed or when - ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1080,21 +1078,18 @@ class BartForSequenceClassification(PretrainedBartModel): Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BartConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): - Classification loss (cross entropy) - logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): - Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned - when ``output_hidden_states=True`` is passed or when - ``config.output_hidden_states=True``): - Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the - self-attention - heads. + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): + Classification loss (cross entropy) + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1182,17 +1177,12 @@ class BartForQuestionAnswering(PretrainedBartModel): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned - when ``output_hidden_states=True`` is passed or when - ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True``): - Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 313dc47d22..a8bcc76c5a 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -691,12 +691,12 @@ class BertModel(BertPreTrainedModel): This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -836,12 +836,12 @@ class BertForPreTraining(BertPreTrainedModel): seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -946,12 +946,12 @@ class BertLMHeadModel(BertPreTrainedModel): Next token prediction loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1058,12 +1058,12 @@ class BertForMaskedLM(BertPreTrainedModel): Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1171,12 +1171,12 @@ class BertForNextSentencePrediction(BertPreTrainedModel): Next sequence prediction (classification) loss. seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1265,12 +1265,12 @@ class BertForSequenceClassification(BertPreTrainedModel): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1366,12 +1366,12 @@ class BertForMultipleChoice(BertPreTrainedModel): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1476,12 +1476,12 @@ class BertForTokenClassification(BertPreTrainedModel): Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1585,12 +1585,12 @@ class BertForQuestionAnswering(BertPreTrainedModel): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. diff --git a/src/transformers/modeling_ctrl.py b/src/transformers/modeling_ctrl.py index 6ee74393de..d10fe66b0c 100644 --- a/src/transformers/modeling_ctrl.py +++ b/src/transformers/modeling_ctrl.py @@ -347,13 +347,12 @@ class CTRLModel(CTRLPreTrainedModel): past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned - when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -541,13 +540,12 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): past (:obj:`List[torch.FloatTensor]` of length :obj:`config.n_layers` with each tensor of shape :obj:`(2, batch_size, num_heads, sequence_length, embed_size_per_head)`): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned - when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py index 064fcea606..b3ea7039cd 100644 --- a/src/transformers/modeling_distilbert.py +++ b/src/transformers/modeling_distilbert.py @@ -428,7 +428,7 @@ class DistilBertModel(DistilBertPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -537,7 +537,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -632,7 +632,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -731,7 +731,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -832,7 +832,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -926,12 +926,12 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py index f25d8110bc..74426cc89b 100644 --- a/src/transformers/modeling_electra.py +++ b/src/transformers/modeling_electra.py @@ -285,7 +285,7 @@ class ElectraModel(ElectraPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -413,7 +413,7 @@ class ElectraForSequenceClassification(ElectraPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -506,13 +506,12 @@ class ElectraForPreTraining(ElectraPreTrainedModel): Total loss of the ELECTRA objective. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`) Prediction scores of the head (scores for each token before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned - when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -623,7 +622,7 @@ class ElectraForMaskedLM(ElectraPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -725,7 +724,7 @@ class ElectraForTokenClassification(ElectraPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -839,7 +838,7 @@ class ElectraForQuestionAnswering(ElectraPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -944,12 +943,12 @@ class ElectraForMultipleChoice(ElectraPreTrainedModel): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. diff --git a/src/transformers/modeling_flaubert.py b/src/transformers/modeling_flaubert.py index c3d9aacd65..b565d7dd92 100644 --- a/src/transformers/modeling_flaubert.py +++ b/src/transformers/modeling_flaubert.py @@ -143,7 +143,7 @@ class FlaubertModel(XLMModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py index 09d21ac87c..ed9175dd09 100644 --- a/src/transformers/modeling_gpt2.py +++ b/src/transformers/modeling_gpt2.py @@ -588,7 +588,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -711,7 +711,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py index d5dfdc1c3c..9b52794201 100644 --- a/src/transformers/modeling_longformer.py +++ b/src/transformers/modeling_longformer.py @@ -600,7 +600,7 @@ class LongformerModel(RobertaModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -734,7 +734,7 @@ class LongformerForMaskedLM(BertPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -837,7 +837,7 @@ class LongformerForSequenceClassification(BertPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -962,10 +962,12 @@ class LongformerForQuestionAnswering(BertPreTrainedModel): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. + Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -1086,7 +1088,7 @@ class LongformerForTokenClassification(BertPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1192,7 +1194,7 @@ class LongformerForMultipleChoice(BertPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py index f7f429e1b8..018ee4eb40 100644 --- a/src/transformers/modeling_mobilebert.py +++ b/src/transformers/modeling_mobilebert.py @@ -774,12 +774,12 @@ class MobileBertModel(MobileBertPreTrainedModel): This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -938,12 +938,12 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel): seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1052,12 +1052,12 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel): Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1160,12 +1160,12 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel): Next sequence prediction (classification) loss. seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1252,13 +1252,15 @@ class MobileBertForSequenceClassification(MobileBertPreTrainedModel): Classification (or regression if config.num_labels==1) loss. logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. + Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -1350,12 +1352,12 @@ class MobileBertForQuestionAnswering(MobileBertPreTrainedModel): Span-start scores (before SoftMax). end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1463,12 +1465,12 @@ class MobileBertForMultipleChoice(MobileBertPreTrainedModel): `num_choices` is the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1573,12 +1575,12 @@ class MobileBertForTokenClassification(MobileBertPreTrainedModel): Classification loss. scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) Classification scores (before SoftMax). - hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py index a066f0ef62..f8643c047c 100644 --- a/src/transformers/modeling_openai.py +++ b/src/transformers/modeling_openai.py @@ -525,7 +525,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -644,7 +644,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. diff --git a/src/transformers/modeling_reformer.py b/src/transformers/modeling_reformer.py index 6fa1f1ee29..2e01b54a02 100644 --- a/src/transformers/modeling_reformer.py +++ b/src/transformers/modeling_reformer.py @@ -1559,7 +1559,7 @@ class ReformerModel(ReformerPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1763,7 +1763,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + all_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py index 56c1abf8c1..be98fb9cdd 100644 --- a/src/transformers/modeling_roberta.py +++ b/src/transformers/modeling_roberta.py @@ -211,7 +211,7 @@ class RobertaForMaskedLM(BertPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -335,7 +335,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -432,7 +432,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -539,7 +539,7 @@ class RobertaForTokenClassification(BertPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -669,7 +669,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py index a51ec99850..4a606fcf5a 100644 --- a/src/transformers/modeling_t5.py +++ b/src/transformers/modeling_t5.py @@ -915,7 +915,7 @@ class T5Model(T5PreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1053,11 +1053,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. + Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: diff --git a/src/transformers/modeling_tf_albert.py b/src/transformers/modeling_tf_albert.py index 53afe5220e..e71223909d 100644 --- a/src/transformers/modeling_tf_albert.py +++ b/src/transformers/modeling_tf_albert.py @@ -715,39 +715,39 @@ class TFAlbertModel(TFAlbertPreTrainedModel): @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)")) def call(self, inputs, **kwargs): r""" - Returns: - :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: - last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during Albert pretraining. This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) - of shape :obj:`(batch_size, sequence_length, hidden_size)`. + Returns: + :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: + last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Albert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. - Examples:: + Examples:: - import tensorflow as tf - from transformers import AlbertTokenizer, TFAlbertModel + import tensorflow as tf + from transformers import AlbertTokenizer, TFAlbertModel - tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') - model = TFAlbertModel.from_pretrained('albert-base-v2') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 - outputs = model(input_ids) - last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') + model = TFAlbertModel.from_pretrained('albert-base-v2') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ outputs = self.albert(inputs, **kwargs) @@ -780,15 +780,18 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). sop_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`): Prediction scores of the sentence order prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. + Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + Examples:: import tensorflow as tf from transformers import AlbertTokenizer, TFAlbertForPreTraining @@ -840,17 +843,17 @@ class TFAlbertForMaskedLM(TFAlbertPreTrainedModel): :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)` Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -916,17 +919,17 @@ class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClass :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`) Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1007,17 +1010,17 @@ class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificat :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1106,17 +1109,17 @@ class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringL Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1216,17 +1219,17 @@ class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: diff --git a/src/transformers/modeling_tf_auto.py b/src/transformers/modeling_tf_auto.py index 2b0cf73f74..fc345c9d4f 100644 --- a/src/transformers/modeling_tf_auto.py +++ b/src/transformers/modeling_tf_auto.py @@ -274,6 +274,7 @@ class TFAutoModel(object): The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: + - `t5`: TFT5Model (T5 model) - `distilbert`: TFDistilBertModel (DistilBERT model) - `roberta`: TFRobertaModel (RoBERTa model) @@ -308,6 +309,7 @@ class TFAutoModel(object): Args: config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: + - isInstance of `distilbert` configuration class: TFDistilBertModel (DistilBERT model) - isInstance of `roberta` configuration class: TFRobertaModel (RoBERTa model) - isInstance of `bert` configuration class: TFBertModel (Bert model) @@ -341,6 +343,7 @@ class TFAutoModel(object): The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: + - `t5`: TFT5Model (T5 model) - `distilbert`: TFDistilBertModel (DistilBERT model) - `roberta`: TFRobertaModel (RoBERTa model) @@ -488,6 +491,7 @@ class TFAutoModelForPreTraining(object): The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: + - `t5`: :class:`~transformers.TFT5ModelWithLMHead` (T5 model) - `distilbert`: :class:`~transformers.TFDistilBertForMaskedLM` (DistilBERT model) - `albert`: :class:`~transformers.TFAlbertForPreTraining` (ALBERT model) @@ -586,6 +590,7 @@ class TFAutoModelWithLMHead(object): The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: + - `t5`: TFT5ForConditionalGeneration (T5 model) - `distilbert`: TFDistilBertForMaskedLM (DistilBERT model) - `roberta`: TFRobertaForMaskedLM (RoBERTa model) @@ -620,6 +625,7 @@ class TFAutoModelWithLMHead(object): Args: config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: + - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) - isInstance of `bert` configuration class: BertModel (Bert model) @@ -653,6 +659,7 @@ class TFAutoModelWithLMHead(object): The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: + - `t5`: TFT5ForConditionalGeneration (T5 model) - `distilbert`: TFDistilBertForMaskedLM (DistilBERT model) - `roberta`: TFRobertaForMaskedLM (RoBERTa model) @@ -805,6 +812,7 @@ class TFAutoModelForMultipleChoice: The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: + - `albert`: TFRobertaForMultiple (Albert model) - `bert`: TFBertForMultipleChoice (Bert model) @@ -898,6 +906,7 @@ class TFAutoModelForSequenceClassification(object): The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: + - `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model) - `roberta`: TFRobertaForSequenceClassification (RoBERTa model) - `bert`: TFBertForSequenceClassification (Bert model) @@ -927,6 +936,7 @@ class TFAutoModelForSequenceClassification(object): Args: config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: + - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) - isInstance of `bert` configuration class: BertModel (Bert model) @@ -958,6 +968,7 @@ class TFAutoModelForSequenceClassification(object): The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: + - `distilbert`: TFDistilBertForSequenceClassification (DistilBERT model) - `roberta`: TFRobertaForSequenceClassification (RoBERTa model) - `bert`: TFBertForSequenceClassification (Bert model) @@ -1054,6 +1065,7 @@ class TFAutoModelForQuestionAnswering(object): The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: + - `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model) - `albert`: TFAlbertForQuestionAnswering (ALBERT model) - `roberta`: TFRobertaForQuestionAnswering (RoBERTa model) @@ -1084,6 +1096,7 @@ class TFAutoModelForQuestionAnswering(object): Args: config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: + - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) - isInstance of `albert` configuration class: AlbertModel (ALBERT model) - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) @@ -1116,6 +1129,7 @@ class TFAutoModelForQuestionAnswering(object): The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: + - `distilbert`: TFDistilBertForQuestionAnswering (DistilBERT model) - `albert`: TFAlbertForQuestionAnswering (ALBERT model) - `roberta`: TFRobertaForQuestionAnswering (RoBERTa model) @@ -1224,6 +1238,7 @@ class TFAutoModelForTokenClassification: Args: config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: The model class to instantiate is selected based on the configuration class: + - isInstance of `bert` configuration class: BertModel (Bert model) - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) - isInstance of `distilbert` configuration class: DistilBertModel (DistilBert model) @@ -1254,6 +1269,7 @@ class TFAutoModelForTokenClassification: The `from_pretrained()` method takes care of returning the correct model class instance based on the `model_type` property of the config object, or when it's missing, falling back to using pattern matching on the `pretrained_model_name_or_path` string: + - `bert`: BertForTokenClassification (Bert model) - `xlnet`: XLNetForTokenClassification (XLNet model) - `distilbert`: DistilBertForTokenClassification (DistilBert model) diff --git a/src/transformers/modeling_tf_bert.py b/src/transformers/modeling_tf_bert.py index acb662c740..7e6028bb13 100644 --- a/src/transformers/modeling_tf_bert.py +++ b/src/transformers/modeling_tf_bert.py @@ -717,17 +717,17 @@ class TFBertModel(TFBertPreTrainedModel): objective during Bert pretraining. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -770,17 +770,17 @@ class TFBertForPreTraining(TFBertPreTrainedModel): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -825,17 +825,17 @@ class TFBertForMaskedLM(TFBertPreTrainedModel): :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -876,17 +876,17 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`) Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -954,17 +954,17 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassific :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1056,17 +1056,17 @@ class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1189,17 +1189,17 @@ class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationL :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1289,17 +1289,17 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss) Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: diff --git a/src/transformers/modeling_tf_ctrl.py b/src/transformers/modeling_tf_ctrl.py index c8744370f7..34b8d1aabd 100644 --- a/src/transformers/modeling_tf_ctrl.py +++ b/src/transformers/modeling_tf_ctrl.py @@ -497,14 +497,14 @@ class TFCTRLModel(TFCTRLPreTrainedModel): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. - hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -578,13 +578,13 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. diff --git a/src/transformers/modeling_tf_distilbert.py b/src/transformers/modeling_tf_distilbert.py index 69cefb7710..6ee2848904 100644 --- a/src/transformers/modeling_tf_distilbert.py +++ b/src/transformers/modeling_tf_distilbert.py @@ -581,17 +581,17 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel): :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -654,17 +654,17 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -735,17 +735,17 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel, TFSeque :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -822,17 +822,17 @@ class TFDistilBertForTokenClassification(TFDistilBertPreTrainedModel, TFTokenCla :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers,DistilBertConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -925,17 +925,17 @@ class TFDistilBertForMultipleChoice(TFDistilBertPreTrainedModel, TFMultipleChoic `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1052,17 +1052,17 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel, TFQuestionAn Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: diff --git a/src/transformers/modeling_tf_electra.py b/src/transformers/modeling_tf_electra.py index 3e917d20f7..72cf7c77f7 100644 --- a/src/transformers/modeling_tf_electra.py +++ b/src/transformers/modeling_tf_electra.py @@ -389,17 +389,17 @@ class TFElectraModel(TFElectraPreTrainedModel): :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the output of the last layer of the model. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -449,17 +449,17 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel): :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Prediction scores of the head (scores for each token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -549,17 +549,17 @@ class TFElectraForMaskedLM(TFElectraPreTrainedModel): :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -633,17 +633,17 @@ class TFElectraForTokenClassification(TFElectraPreTrainedModel, TFTokenClassific :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -732,17 +732,17 @@ class TFElectraForQuestionAnswering(TFElectraPreTrainedModel, TFQuestionAnswerin Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: diff --git a/src/transformers/modeling_tf_gpt2.py b/src/transformers/modeling_tf_gpt2.py index 8a6c03ef43..90f696062a 100644 --- a/src/transformers/modeling_tf_gpt2.py +++ b/src/transformers/modeling_tf_gpt2.py @@ -497,14 +497,14 @@ class TFGPT2Model(TFGPT2PreTrainedModel): Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. - hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -557,13 +557,13 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -643,18 +643,17 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as `input_ids` as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - Examples:: # For example purposes. Not runnable. diff --git a/src/transformers/modeling_tf_mobilebert.py b/src/transformers/modeling_tf_mobilebert.py index f46a251db3..f87ec718b2 100644 --- a/src/transformers/modeling_tf_mobilebert.py +++ b/src/transformers/modeling_tf_mobilebert.py @@ -858,16 +858,17 @@ class TFMobileBertModel(TFMobileBertPreTrainedModel): objective during the original Bert pretraining. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -908,16 +909,17 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). seq_relationship_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`): Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -960,16 +962,17 @@ class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel): :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1021,16 +1024,17 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel): :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs: seq_relationship_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, 2)`) Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1098,16 +1102,17 @@ class TFMobileBertForSequenceClassification(TFMobileBertPreTrainedModel, TFSeque :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1198,16 +1203,17 @@ class TFMobileBertForQuestionAnswering(TFMobileBertPreTrainedModel, TFQuestionAn Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1305,16 +1311,17 @@ class TFMobileBertForMultipleChoice(TFMobileBertPreTrainedModel, TFMultipleChoic `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1439,16 +1446,17 @@ class TFMobileBertForTokenClassification(TFMobileBertPreTrainedModel, TFTokenCla :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.MobileBertConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` or ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: diff --git a/src/transformers/modeling_tf_openai.py b/src/transformers/modeling_tf_openai.py index aad54cd4fb..89ab8f2f1d 100644 --- a/src/transformers/modeling_tf_openai.py +++ b/src/transformers/modeling_tf_openai.py @@ -455,14 +455,14 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs: last_hidden_state (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): Sequence of hidden-states at the last layer of the model. - hidden_states (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -504,13 +504,13 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -587,13 +587,13 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. diff --git a/src/transformers/modeling_tf_roberta.py b/src/transformers/modeling_tf_roberta.py index b5cbcf60de..fdcbba1dda 100644 --- a/src/transformers/modeling_tf_roberta.py +++ b/src/transformers/modeling_tf_roberta.py @@ -207,17 +207,17 @@ class TFRobertaModel(TFRobertaPreTrainedModel): objective during Bert pretraining. This output is usually *not* a good summary of the semantic content of the input, you're often better with averaging or pooling the sequence of hidden-states for the whole input sequence. - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -284,17 +284,17 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: prediction_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -375,17 +375,17 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel, TFSequenceCla :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: logits (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -474,17 +474,17 @@ class TFRobertaForMultipleChoice(TFRobertaPreTrainedModel, TFMultipleChoiceLoss) `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -600,17 +600,17 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel, TFTokenClassific :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -699,17 +699,17 @@ class TFRobertaForQuestionAnswering(TFRobertaPreTrainedModel, TFQuestionAnswerin Span-start scores (before SoftMax). end_scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: diff --git a/src/transformers/modeling_tf_t5.py b/src/transformers/modeling_tf_t5.py index f0a12d3a04..2664dddcc8 100644 --- a/src/transformers/modeling_tf_t5.py +++ b/src/transformers/modeling_tf_t5.py @@ -915,13 +915,13 @@ class TFT5Model(TFT5PreTrainedModel): Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input). Note that when using `decoder_past_key_value_states`, the model only outputs the last `hidden-state` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -1055,15 +1055,16 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel): Can be used to speed up sequential decoding (see `decoder_past_key_value_states` input). Note that when using `decoder_past_key_value_states`, the model only outputs the last `prediction_score` of the sequence of shape :obj:`(batch_size, 1, config.vocab_size)`. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: diff --git a/src/transformers/modeling_tf_transfo_xl.py b/src/transformers/modeling_tf_transfo_xl.py index 4826d88cf9..63921c4368 100644 --- a/src/transformers/modeling_tf_transfo_xl.py +++ b/src/transformers/modeling_tf_transfo_xl.py @@ -733,13 +733,13 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel): Can be used (see `mems` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -832,13 +832,13 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. diff --git a/src/transformers/modeling_tf_xlm.py b/src/transformers/modeling_tf_xlm.py index b5b9c2412d..f6e80d34e5 100644 --- a/src/transformers/modeling_tf_xlm.py +++ b/src/transformers/modeling_tf_xlm.py @@ -711,13 +711,13 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): prediction_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -785,13 +785,13 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel, TFSequenceClassificat logits (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, config.num_labels)`): Classification (or regression if config.num_labels==1) scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -886,17 +886,17 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss): `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1023,17 +1023,17 @@ class TFXLMForTokenClassification(TFXLMPreTrainedModel, TFTokenClassificationLos :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: scores (:obj:`Numpy array` or :obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`): Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1125,13 +1125,13 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel, TFQuestionAnsweringL end_scores (:obj:`tf.Tensor` or :obj:`Numpy array` of shape :obj:`(batch_size, sequence_length,)`): Span-end scores (before SoftMax). hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. diff --git a/src/transformers/modeling_tf_xlnet.py b/src/transformers/modeling_tf_xlnet.py index 370de3cde8..b4b1d9ce27 100644 --- a/src/transformers/modeling_tf_xlnet.py +++ b/src/transformers/modeling_tf_xlnet.py @@ -927,13 +927,13 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -1019,13 +1019,13 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel, TFSequenceClassif Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -1127,17 +1127,17 @@ class TFXLNetForMultipleChoice(TFXLNetPreTrainedModel, TFMultipleChoiceLoss): `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above). Classification scores (before SoftMax). - hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when - ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. Examples:: @@ -1274,13 +1274,13 @@ class TFXLNetForTokenClassification(TFXLNetPreTrainedModel, TFTokenClassificatio Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. @@ -1384,13 +1384,13 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel, TFQuestionAnswer Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model should not be passed as input ids as they have already been computed. hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for the output of the embeddings + one for the output of each layer) + tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): - Tuple of :obj:`tf.Tensor` or :obj:`Numpy array` (one for each layer) of shape - :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + tuple of :obj:`tf.Tensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. diff --git a/src/transformers/modeling_transfo_xl.py b/src/transformers/modeling_transfo_xl.py index 8655039035..6b2eb2c8e5 100644 --- a/src/transformers/modeling_transfo_xl.py +++ b/src/transformers/modeling_transfo_xl.py @@ -772,7 +772,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -978,7 +978,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py index 60efaebada..714bca06cb 100644 --- a/src/transformers/modeling_xlm.py +++ b/src/transformers/modeling_xlm.py @@ -419,7 +419,7 @@ class XLMModel(XLMPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -666,7 +666,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -755,7 +755,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -861,7 +861,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -999,7 +999,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1095,7 +1095,7 @@ class XLMForTokenClassification(XLMPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py index d0cda81ad7..a70916b341 100644 --- a/src/transformers/modeling_xlnet.py +++ b/src/transformers/modeling_xlnet.py @@ -779,7 +779,7 @@ class XLNetModel(XLNetPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1081,7 +1081,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1202,7 +1202,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1310,7 +1310,7 @@ class XLNetForTokenClassification(XLNetPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1425,7 +1425,7 @@ class XLNetForMultipleChoice(XLNetPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1552,7 +1552,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. @@ -1698,7 +1698,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): of shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus the initial embedding outputs. - attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``): + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.