updating head masking, readme and docstrings

2019-06-17 15:51:28 +02:00
parent 965f172de6
commit 33d3db5c43
7 changed files with 220 additions and 43 deletions
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -613,7 +613,10 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
    You should use the associate indices to index the embeddings.

    Params:
-        config: a OpenAIGPTConfig class instance with the configuration to build a new model
+        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
+        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
+        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
+            This can be used to compute head importance metrics. Default: False

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
@@ -625,10 +628,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
            (the previous two being the word and position embeddings).
            The input, position and token_type embeddings are summed inside the Transformer before the first
            self-attention block.
+        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.

    Outputs:
-        `hidden_states`: the encoded-hidden-states at the top of the model
-            as a torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
+        `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)
+            as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
            (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)

    Example usage:
@@ -694,13 +699,17 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
        # Prepare head mask if needed
        # 1.0 in head_mask indicate we mask the head
        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
        if head_mask is not None:
            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand_as(self.config.n_layer, -1, -1, -1, -1)
            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each instance in batch
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
            head_mask = (1.0 - head_mask)
+        else:
+            head_mask = [None] * self.config.n_layer

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_ids.size(-1))
@@ -720,8 +729,8 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):

        all_attentions = []
        all_hidden_states = [hidden_states.view(*output_shape)]
-        for block in self.h:
-            outputs = block(hidden_states, head_mask)
+        for i, block in enumerate(self.h):
+            outputs = block(hidden_states, head_mask[i])
            if self.output_attentions:
                attentions, hidden_states = outputs
                all_attentions.append(attentions)
@@ -755,7 +764,10 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
    You should use the associate indices to index the embeddings.

    Params:
-        config: a OpenAIGPTConfig class instance with the configuration to build a new model
+        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
+        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
+        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
+            This can be used to compute head importance metrics. Default: False

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
@@ -770,6 +782,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
            is only computed for the labels set in [0, ..., vocab_size]
+        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.

    Outputs:
        if `lm_labels` is not `None`:
@@ -847,7 +861,10 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
    You should use the associate indices to index the embeddings.

    Params:
-        config: a OpenAIGPTConfig class instance with the configuration to build a new model
+        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
+        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
+        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
+            This can be used to compute head importance metrics. Default: False

    Inputs:
        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
@@ -866,6 +883,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
            is only computed for the labels set in [0, ..., total_tokens_embeddings]
        `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
            with indices selected in [0, ..., num_choices].
+        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.

    Outputs:
        if `lm_labels` and `multiple_choice_labels` are not `None`: