From d409aca32632718afbcd098de2bb11b9b71b7df1 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 12 Nov 2019 10:59:37 -0500 Subject: [PATCH] Clarify the use of past in GPT2 and CTRL --- transformers/modeling_ctrl.py | 9 ++++++--- transformers/modeling_gpt2.py | 12 ++++++++---- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/transformers/modeling_ctrl.py b/transformers/modeling_ctrl.py index 405e33602a..1ed9e6ebb1 100644 --- a/transformers/modeling_ctrl.py +++ b/transformers/modeling_ctrl.py @@ -220,7 +220,8 @@ CTRL_INPUTS_DOCSTRING = r""" Inputs: **past**: list of ``torch.FloatTensor`` (one for each layer): that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model - (see `past` output below). Can be used to speed up sequential decoding. + (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: @@ -252,7 +253,8 @@ class CTRLModel(CTRLPreTrainedModel): **past**: list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: that contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. + Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) of shape ``(batch_size, sequence_length, hidden_size)``: @@ -437,7 +439,8 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): **past**: list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: that contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. + Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) of shape ``(batch_size, sequence_length, hidden_size)``: diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py index e3d26797c8..35bc5c8d6e 100644 --- a/transformers/modeling_gpt2.py +++ b/transformers/modeling_gpt2.py @@ -298,7 +298,8 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs: **past**: list of ``torch.FloatTensor`` (one for each layer): that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model - (see `past` output below). Can be used to speed up sequential decoding. + (see `past` output below). Can be used to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: @@ -330,7 +331,8 @@ class GPT2Model(GPT2PreTrainedModel): **past**: list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: that contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. + Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) of shape ``(batch_size, sequence_length, hidden_size)``: @@ -503,7 +505,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): **past**: list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: that contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. + Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) of shape ``(batch_size, sequence_length, hidden_size)``: @@ -595,7 +598,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): **past**: list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: that contains pre-computed hidden-states (key and values in the attention blocks). - Can be used (see `past` input) to speed up sequential decoding. + Can be used (see `past` input) to speed up sequential decoding. The token ids which have their past given to this model + should not be passed as input ids as they have already been computed. **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) of shape ``(batch_size, sequence_length, hidden_size)``: