From 0201d86015d6c79dac376933161c21395479f4d8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 15 Jul 2019 10:11:09 +0200
Subject: [PATCH] added doc for transformer-xl

---
 pytorch_transformers/modeling_gpt2.py       |   8 +-
 pytorch_transformers/modeling_openai.py     |  11 +-
 pytorch_transformers/modeling_transfo_xl.py | 199 ++++++++++----------
 3 files changed, 106 insertions(+), 112 deletions(-)

diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 8aaf84a099..06386f9ace 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -382,10 +382,10 @@ GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
         https://pytorch.org/docs/stable/nn.html#module
 
     Parameters:
-        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
 """
 
-GPT2_INPUTS_DOCTRING = r"""    Inputs:
+GPT2_INPUTS_DOCSTRING = r"""    Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
@@ -413,7 +413,7 @@ GPT2_INPUTS_DOCTRING = r"""    Inputs:
 """
 
 @add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
-                      GPT2_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
+                      GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
 class GPT2Model(GPT2PreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -538,7 +538,7 @@ class GPT2Model(GPT2PreTrainedModel):
 
 
 @add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
-(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
+(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
 class GPT2LMHeadModel(GPT2PreTrainedModel):
     r"""
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 6e5dc44f04..268252a12c 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
 from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
-                             PreTrainedModel, prune_conv1d_layer, SequenceSummary)
+                             PreTrainedModel, prune_conv1d_layer, SequenceSummary,
+                             add_start_docstrings)
 from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
@@ -395,10 +396,10 @@ OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
         https://pytorch.org/docs/stable/nn.html#module
 
     Parameters:
-        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
 """
 
-OPENAI_GPT_INPUTS_DOCTRING = r"""    Inputs:
+OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
@@ -422,7 +423,7 @@ OPENAI_GPT_INPUTS_DOCTRING = r"""    Inputs:
 """
 
 @add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
-                      OPENAI_GPT_START_DOCSTRING, GPT2_INPUTS_DOCTRING)
+                      OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -532,7 +533,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 
 
 @add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
-(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCTRING)
+(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
 class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     r"""
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index f368d32636..7eb7a46df3 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -36,7 +36,7 @@ from torch.nn.parameter import Parameter
 
 from .modeling_bert import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
-from .modeling_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
+from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings)
 
 logger = logging.getLogger(__name__)
 
@@ -910,23 +910,71 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
         pass
 
 
+TRANSFO_XL_START_DOCSTRING = r"""    The Transformer-XL model was proposed in
+    `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`_
+    by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+    It's a causal (uni-directional) transformer with relative positioning (sinusoidal) embeddings which can reuse
+    previously computed hidden-states to attend to longer context (memory).
+    This model also uses adaptive softmax inputs and outputs (tied).
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matters related to general usage and behavior.
+
+    .. _`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`:
+        https://arxiv.org/abs/1901.02860
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
+"""
+
+TRANSFO_XL_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **mems**: (`optional`)
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask indices selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare Transformer-XL Model transformer outputting raw hidden-states without any specific head on top.",
+                      TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
 class TransfoXLModel(TransfoXLPreTrainedModel):
-    """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **mems**:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    Transformer XL uses relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that:
+    Examples::
 
-        - you don't need to specify positioning embeddings indices.
+        >>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
+        >>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+        >>> model = TransfoXLModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states, mems = outputs[:2]
 
-        - the tokens in the vocabulary have to be sorted in decreasing frequency.
-
-    Args:
-        config: a TransfoXLConfig class instance with the configuration to build a new model
-
-
-    Example::
-
-        config = TransfoXLConfig()
-        model = TransfoXLModel(config)
     """
     def __init__(self, config):
         super(TransfoXLModel, self).__init__(config)
@@ -1193,41 +1241,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)
 
     def forward(self, input_ids, mems=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the token indices selected in the range [0, self.config.n_token[
-            `mems`: optional memory of hidden states from previous forward passes
-                as a list (num layers) of hidden states at the entry of each layer
-                each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
-                Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
-
-        Returns:
-            A tuple of ``(last_hidden_state, new_mems)``.
-
-                ``last_hidden_state``: the encoded-hidden-states at the top of the model
-                as a ``torch.FloatTensor`` of size [batch_size, sequence_length, self.config.d_model]
-
-                ``new_mems``: list (num layers) of updated mem states at the entry of each layer
-                each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
-                Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and
-                ``labels``
-
-        Example::
-
-            # Already been converted into BPE token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
-
-            last_hidden_state, new_mems = model(input_ids)
-            # or
-            last_hidden_state, new_mems = model.forward(input_ids)
-
-            # Another time on input_ids_next using the memory:
-            last_hidden_state, new_mems = model(input_ids_next, new_mems)
-        """
         # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
         # so we transpose here from shape [bsz, len] to shape [len, bsz]
         input_ids = input_ids.transpose(0, 1).contiguous()
@@ -1239,27 +1252,45 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         return outputs  # last hidden state, new_mems, (all hidden states), (all attentions)
 
 
+@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top
+    (adaptive softmax with weights tied to the adaptive input embeddings)""",
+    TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
 class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
-    """Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context").
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for language modeling.
+            Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
+            computed for labels in ``[0, ..., config.vocab_size]``
+
-    This model adds an (adaptive) softmax head on top of the ``TransfoXLModel``
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **prediction_scores**: ``None`` if ``labels`` is provided else ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+            We don't output them when the loss is computed to speed up adaptive softmax decoding.
+        **mems**:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
 
-    Transformer XL uses a relative positioning (with sinusoidal patterns) and adaptive softmax inputs which means that:
+    Examples::
 
-        - you don't need to specify positioning embeddings indices
+        >>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
+        >>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+        >>> model = TransfoXLLMHeadModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> prediction_scores, mems = outputs[:2]
 
-        - the tokens in the vocabulary have to be sorted in decreasing frequency.
-
-    Call ``self.tie_weights()`` if you update/load the weights of the transformer to keep the weights tied.
-
-    Args:
-        config: a ``TransfoXLConfig`` class instance with the configuration to build a new model
-
-
-    Example::
-
-        config = TransfoXLConfig()
-        model = TransfoXLModel(config)
     """
     def __init__(self, config):
         super(TransfoXLLMHeadModel, self).__init__(config)
@@ -1310,44 +1341,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         return self.transformer.init_mems(data)
 
     def forward(self, input_ids, labels=None, mems=None, head_mask=None):
-        """
-        Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
-
-        Args:
-            `input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the token indices selected in the range [0, self.config.n_token[
-            `labels`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length]
-                with the labels token indices selected in the range [0, self.config.n_token[
-            `mems`: an optional memory of hidden states from previous forward passes
-                as a list (num layers) of hidden states at the entry of each layer
-                each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
-                Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
-
-        Returns:
-            A tuple of (last_hidden_state, new_mems)
-
-                ``last_hidden_state``: output of the (adaptive) softmax. If ``labels`` is ``None``, it is the negative
-                log likelihood of shape [batch_size, sequence_length]. Otherwise, it is the log probabilities of
-                tokens of, shape [batch_size, sequence_length, n_tokens].
-
-                ``new_mems``: list (num layers) of updated mem states at the entry of each layer
-                each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
-                Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and
-                ``labels``
-
-        Example::
-
-            # Already been converted into BPE token ids
-            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-            input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
-
-            last_hidden_state, new_mems = model(input_ids)
-            # or
-            last_hidden_state, new_mems = model.forward(input_ids)
-
-            # Another time on input_ids_next using the memory:
-            last_hidden_state, new_mems = model(input_ids_next, mems=new_mems)
-        """
         bsz = input_ids.size(0)
         tgt_len = input_ids.size(1)