updating head masking, readme and docstrings
This commit is contained in:
@@ -613,7 +613,10 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
||||
You should use the associate indices to index the embeddings.
|
||||
|
||||
Params:
|
||||
config: a OpenAIGPTConfig class instance with the configuration to build a new model
|
||||
`config`: a OpenAIGPTConfig class instance with the configuration to build a new model
|
||||
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
|
||||
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
|
||||
This can be used to compute head importance metrics. Default: False
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
|
||||
@@ -625,10 +628,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
||||
(the previous two being the word and position embeddings).
|
||||
The input, position and token_type embeddings are summed inside the Transformer before the first
|
||||
self-attention block.
|
||||
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
|
||||
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
|
||||
|
||||
Outputs:
|
||||
`hidden_states`: the encoded-hidden-states at the top of the model
|
||||
as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
|
||||
`hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)
|
||||
as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
|
||||
(or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
|
||||
|
||||
Example usage:
|
||||
@@ -694,13 +699,17 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
||||
# Prepare head mask if needed
|
||||
# 1.0 in head_mask indicate we mask the head
|
||||
# attention_probs has shape bsz x n_heads x N x N
|
||||
# head_mask has shape n_layer x batch x n_heads x N x N
|
||||
if head_mask is not None:
|
||||
if head_mask.dim() == 1:
|
||||
head_mask = head_mask.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
|
||||
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
|
||||
head_mask = head_mask.expand_as(self.config.n_layer, -1, -1, -1, -1)
|
||||
elif head_mask.dim() == 2:
|
||||
head_mask = head_mask.unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each instance in batch
|
||||
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
|
||||
head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
|
||||
head_mask = (1.0 - head_mask)
|
||||
else:
|
||||
head_mask = [None] * self.config.n_layer
|
||||
|
||||
input_shape = input_ids.size()
|
||||
input_ids = input_ids.view(-1, input_ids.size(-1))
|
||||
@@ -720,8 +729,8 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
||||
|
||||
all_attentions = []
|
||||
all_hidden_states = [hidden_states.view(*output_shape)]
|
||||
for block in self.h:
|
||||
outputs = block(hidden_states, head_mask)
|
||||
for i, block in enumerate(self.h):
|
||||
outputs = block(hidden_states, head_mask[i])
|
||||
if self.output_attentions:
|
||||
attentions, hidden_states = outputs
|
||||
all_attentions.append(attentions)
|
||||
@@ -755,7 +764,10 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
||||
You should use the associate indices to index the embeddings.
|
||||
|
||||
Params:
|
||||
config: a OpenAIGPTConfig class instance with the configuration to build a new model
|
||||
`config`: a OpenAIGPTConfig class instance with the configuration to build a new model
|
||||
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
|
||||
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
|
||||
This can be used to compute head importance metrics. Default: False
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
|
||||
@@ -770,6 +782,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
||||
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
|
||||
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
|
||||
is only computed for the labels set in [0, ..., vocab_size]
|
||||
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
|
||||
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
|
||||
|
||||
Outputs:
|
||||
if `lm_labels` is not `None`:
|
||||
@@ -847,7 +861,10 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
You should use the associate indices to index the embeddings.
|
||||
|
||||
Params:
|
||||
config: a OpenAIGPTConfig class instance with the configuration to build a new model
|
||||
`config`: a OpenAIGPTConfig class instance with the configuration to build a new model
|
||||
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
|
||||
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
|
||||
This can be used to compute head importance metrics. Default: False
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
|
||||
@@ -866,6 +883,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
is only computed for the labels set in [0, ..., total_tokens_embeddings]
|
||||
`multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
|
||||
with indices selected in [0, ..., num_choices].
|
||||
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
|
||||
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
|
||||
|
||||
Outputs:
|
||||
if `lm_labels` and `multiple_choice_labels` are not `None`:
|
||||
|
||||
Reference in New Issue
Block a user