Refactored Docstrings of BERT, GPT2, GPT, TransfoXL, XLM and XLNet.

2019-07-09 15:55:31 -04:00
parent ed6c8d37f4
commit 8fe2c9d98e
13 changed files with 924 additions and 763 deletions
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -101,6 +101,25 @@ def gelu(x):

 class GPT2Config(PretrainedConfig):
    """Configuration class to store the configuration of a `GPT2Model`.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+        n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
+        n_positions: Number of positional embeddings.
+        n_ctx: Size of the causal mask (usually same as n_positions).
+        n_embd: Dimensionality of the embeddings and hidden states.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        layer_norm_epsilon: epsilon to use in the layer norm layers
+        resid_pdrop: The dropout probabilitiy for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attn_pdrop: The dropout ratio for the attention
+            probabilities.
+        embd_pdrop: The dropout ratio for the embeddings.
+        initializer_range: The sttdev of the truncated_normal_initializer for
+            initializing all weight matrices.
+        predict_special_tokens: should we predict special tokens (when the model has a LM head)
    """
    pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP

@@ -418,9 +437,11 @@ class GPT2Model(GPT2PreTrainedModel):
    GPT-2 use a single embedding matrix to store the word and special embeddings.
    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
+    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
+
+    The embeddings are ordered as follow in the token embeddings matrix:
+    ::

-    The embeddings are ordered as follow in the token embeddings matrice:
        [0,                                                         ----------------------
         ...                                                        -> word embeddings
         config.vocab_size - 1,                                     ______________________
@@ -428,47 +449,24 @@ class GPT2Model(GPT2PreTrainedModel):
         ...                                                        -> special embeddings
         config.vocab_size + config.n_special - 1]                  ______________________

-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
-        total_tokens_embeddings = config.vocab_size + config.n_special
-    You should use the associate indices to index the embeddings.
+    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is equal to

-    Params:
+    ::
+
+        total_tokens_embeddings = config.vocab_size + config.n_special
+
+    You should use the associated indices to index the embeddings.
+
+    Args:
        `config`: a GPT2Config class instance with the configuration to build a new model
        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False

-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
-            (key and values in the attention blocks) to speed up sequential decoding
-            (this is the presents output of the model, cf. below).
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.

-    Outputs a tuple consisting of:
-        `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)
-            as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
-            (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
-        `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
-            torch.FloatTensors. They can be reused to speed up sequential decoding.

-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    Example::

-    config = modeling_gpt2.GPT2Config()
-
-    model = modeling_gpt2.GPT2Model(config)
-    hidden_states, presents = model(input_ids)
-    ```
+        config = modeling_gpt2.GPT2Config()
+        model = modeling_gpt2.GPT2Model(config)
    """

    def __init__(self, config):
@@ -485,7 +483,7 @@ class GPT2Model(GPT2PreTrainedModel):
        self.apply(self.init_weights)

    def set_num_special_tokens(self, num_special_tokens=None):
-        " Update input embeddings with new embedding matrice if needed "
+        """Update input embeddings with new embedding matrix if needed."""
        if num_special_tokens is None or self.config.n_special == num_special_tokens:
            return
        # Update config
@@ -506,6 +504,47 @@ class GPT2Model(GPT2PreTrainedModel):
            self.h[layer].attn.prune_heads(heads)

    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
+                were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
+            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                with the position indices (selected in the range [0, config.n_positions - 1[.
+            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                You can use it to add a third type of embedding to each input token in the sequence
+                (the previous two being the word and position embeddings).
+                The input, position and token_type embeddings are summed inside the Transformer before the first
+                self-attention block.
+            `past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
+                (key and values in the attention blocks) to speed up sequential decoding
+                (this is the presents output of the model, cf. below).
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+             A tuple consisting of ``hidden_states`` and ``presents``.
+
+                 ``hidden_states`` are a list of all the encoded-hidden-states in the model (length of the list: number of
+                 layers + 1 for the output of the embeddings) as ``torch.FloatTensor`` of size [batch_size, sequence_length,
+                 hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of
+                 input_ids).
+
+                 ``presents`` are a list of pre-computed hidden-states (key and values in each attention blocks) as
+                 torch.FloatTensors. They can be reused to speed up sequential decoding.
+
+
+        Example::
+
+            # Already been converted into BPE token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+
+            hidden_states, presents = model(input_ids)
+            # or
+            hidden_states, presents = model.forward(input_ids)
+
+        """
        if past is None:
            past_length = 0
            past = [None] * len(self.h)
@@ -580,50 +619,18 @@ class GPT2Model(GPT2PreTrainedModel):
 class GPT2LMHeadModel(GPT2PreTrainedModel):
    """OpenAI GPT-2 model with a Language Modeling head ("Language Models are Unsupervised Multitask Learners").

-    Params:
+    Args:
        `config`: a GPT2Config class instance with the configuration to build a new model
        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
            This can be used to compute head importance metrics. Default: False

-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
-            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., vocab_size]
-        `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
-            (key and values in the attention blocks) to speed up sequential decoding
-            (this is the presents output of the model, cf. below).
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.

-    Outputs:
-        if `lm_labels` is not `None`:
-            Outputs the language modeling loss.
-        else a tuple:
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, config.vocab_size]
-                (or more generally [d_1, ..., d_n, config.vocab_size] were d_1 ... d_n are the dimension of input_ids)
-            `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
-                torch.FloatTensors. They can be reused to speed up sequential decoding.

-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+    Example::

-    config = modeling_gpt2.GPT2Config()
-
-    model = modeling_gpt2.GPT2LMHeadModel(config)
-    lm_logits, presents = model(input_ids)
-    ```
+        config = modeling_gpt2.GPT2Config()
+        model = modeling_gpt2.GPT2LMHeadModel(config)
    """

    def __init__(self, config):
@@ -633,14 +640,58 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
        self.apply(self.init_weights)

    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrice
-            Make sure we are sharing the embeddings
+        """
+        Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
+        TODO Shouldn't we put args + returns ?
        """
        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
        self.transformer.set_num_special_tokens(num_special_tokens)
        self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)

    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
+                were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
+            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                with the position indices (selected in the range [0, config.n_positions - 1[.
+            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                You can use it to add a third type of embedding to each input token in the sequence
+                (the previous two being the word and position embeddings).
+                The input, position and token_type embeddings are summed inside the Transformer before the first
+                self-attention block.
+            `lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
+                with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
+                is only computed for the labels set in [0, ..., vocab_size]
+            `past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
+                (key and values in the attention blocks) to speed up sequential decoding
+                (this is the presents output of the model, cf. below).
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            If ``lm_labels`` is not ``None``, returns the language modeling loss. It ``lm_labels`` is ``None``, returns
+            a tuple of (``lm_logits``, ``presents``).
+
+                ``lm_logits`` is the language modeling logits as a ``torch.FloatTensor`` of size [batch_size,
+                sequence_length, config.vocab_size] (or more generally [d_1, ..., d_n, config.vocab_size] were d_1 ...
+                d_n are the dimension of input_ids).
+
+                ``presents`` is a list of pre-computed hidden-states (key and values in each attention blocks) as
+                torch.FloatTensors. They can be reused to speed up sequential decoding.
+
+        Example::
+
+            # Already been converted into BPE token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+
+            lm_logits, presents = model(input_ids)
+            # or
+            lm_logits, presents = model.forward(input_ids)
+
+        """
        transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
        hidden_states = transformer_outputs[0]

@@ -663,55 +714,16 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
    """OpenAI GPT-2 model with a Language Modeling and a Multiple Choice head ("Language Models are Unsupervised Multitask Learners").

-    Params:
+    Args:
        `config`: a GPT2Config class instance with the configuration to build a new model
        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
            This can be used to compute head importance metrics. Default: False

-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
-            indices selected in the range [0, config.vocab_size[
-        `mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from
-            which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
-            with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., config.vocab_size]
-        `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
-            with indices selected in [0, ..., num_choices].
-        `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
-            (key and values in the attention blocks) to speed up sequential decoding
-            (this is the presents output of the model, cf. below).
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+    Example::

-    Outputs:
-        if `lm_labels` and `multiple_choice_labels` are not `None`:
-            Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
-        else: a tuple with
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, config.vocab_size]
-            `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
-            `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
-                torch.FloatTensors. They can be reused to speed up sequential decoding.
-
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)
-    mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
-
-    config = modeling_gpt2.GPT2Config()
-
-    model = modeling_gpt2.GPT2DoubleHeadsModel(config)
-    lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids)
-    ```
+        config = modeling_gpt2.GPT2Config()
+        model = modeling_gpt2.GPT2DoubleHeadsModel(config)
    """

    def __init__(self, config):
@@ -723,8 +735,9 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
        self.apply(self.init_weights)

    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrice
-            Make sure we are sharing the embeddings
+        """
+        Update input and output embeddings with new embedding matrix.Make sure we are sharing the embeddings
+        TODO Shouldn't we put args + returns ?
        """
        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
        self.transformer.set_num_special_tokens(num_special_tokens)
@@ -732,6 +745,55 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):

    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
                position_ids=None, past=None, head_mask=None):
+        """
+        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
+
+        Args:
+            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] with the BPE token
+                indices selected in the range [0, config.vocab_size[
+            `mc_token_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices] with the index of the token from
+                which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
+            `position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                with the position indices (selected in the range [0, config.n_positions - 1[.
+            `token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
+                You can use it to add a third type of embedding to each input token in the sequence
+                (the previous two being the word and position embeddings).
+                The input, position and token_type embeddings are summed inside the Transformer before the first
+                self-attention block.
+            `lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
+                with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss
+                is only computed for the labels set in [0, ..., config.vocab_size]
+            `multiple_choice_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
+                with indices selected in [0, ..., num_choices].
+            `past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
+                (key and values in the attention blocks) to speed up sequential decoding
+                (this is the presents output of the model, cf. below).
+            `head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
+                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+
+        Returns:
+            If ``lm_labels`` and ``multiple_choice_labels`` are not ``None``, outputs a
+            ``tuple(language_modeling_loss, multiple_choice_loss)``. If they are not ``None``, outputs a
+            ``tuple(lm_logits, multiple_choice_logits, presents)``.
+
+                ``lm_logits``: the language modeling logits as a ``torch.FloatTensor`` of size [batch_size, num_choices, sequence_length, config.vocab_size]
+
+                ``multiple_choice_logits``: the multiple choice logits as a ``torch.FloatTensor`` of size [batch_size, num_choices]
+
+                ``presents``: a list of pre-computed hidden-states (key and values in each attention blocks) as
+                torch.FloatTensors. They can be reused to speed up sequential decoding.
+
+        Example::
+
+            # Already been converted into BPE token ids
+            input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)
+            mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
+
+            lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids)
+            # or
+            lm_logits, multiple_choice_logits, presents = model.forward(input_ids, mc_token_ids)
+
+        """
        transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
        hidden_states = transformer_outputs[0]