From 098a89f312311a730275a79af7cf5c527d35fdd7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Tue, 29 Oct 2019 20:08:03 +0100
Subject: [PATCH] update docstrings; rename lm_labels to more explicit
 ltr_lm_labels

---
 examples/run_summarization_finetuning.py |  8 ++--
 transformers/modeling_bert.py            | 51 +++++++++++++-----------
 2 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/examples/run_summarization_finetuning.py b/examples/run_summarization_finetuning.py
index 1888f56caf..2dc8c660ce 100644
--- a/examples/run_summarization_finetuning.py
+++ b/examples/run_summarization_finetuning.py
@@ -26,7 +26,7 @@ import numpy as np
 from tqdm import tqdm, trange
 import torch
 from torch.optim import Adam
-from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 
 from transformers import (
     AutoTokenizer,
@@ -283,14 +283,14 @@ def evaluate(args, model, tokenizer, prefix=""):
     model.eval()
 
     for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch
+        source, target, encoder_token_type_ids, encoder_mask, decoder_mask, ltr_lm_labels = batch
 
         source = source.to(args.device)
         target = target.to(args.device)
         encoder_token_type_ids = encoder_token_type_ids.to(args.device)
         encoder_mask = encoder_mask.to(args.device)
         decoder_mask = decoder_mask.to(args.device)
-        lm_labels = lm_labels.to(args.device)
+        ltr_lm_labels = ltr_lm_labels.to(args.device)
 
         with torch.no_grad():
             outputs = model(
@@ -299,7 +299,7 @@ def evaluate(args, model, tokenizer, prefix=""):
                 encoder_token_type_ids=encoder_token_type_ids,
                 encoder_attention_mask=encoder_mask,
                 decoder_attention_mask=decoder_mask,
-                decoder_lm_labels=lm_labels,
+                decoder_ltr_lm_labels=ltr_lm_labels,
             )
             lm_loss = outputs[0]
             eval_loss += lm_loss.mean().item()
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 1081c8dd7b..3fec69a814 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -548,6 +548,14 @@ BERT_INPUTS_DOCSTRING = r"""
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **encoder_hidden_states**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``:
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model
+            is configured as a decoder.
+        **encoder_attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
+            is used in the cross-attention if the model is configured as a decoder.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
 """
 
 @add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
@@ -609,26 +617,18 @@ class BertModel(BertPreTrainedModel):
                 head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
         """ Forward pass on the Model.
 
-        The values of the attention matrix (shape [batch_size, seq_length])
-        should be 1.0 for the position we want to attend to and 0. for the ones
-        we do not want to attend to.
-
         The model can behave as an encoder (with only self-attention) as well
         as a decoder, in which case a layer of cross-attention is added between
-        ever self-attention layer, following the architecture described in [1].
+        the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani,
+        Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
 
-        To behave like as a decoder the model needs to be initialized with the
-        `is_decoder` argument of the config set to `True`. An
+        To behave as an decoder the model needs to be initialized with the
+        `is_decoder` argument of the configuration set to `True`; an
         `encoder_hidden_states` is expected as an input to the forward pass.
-        When a decoder, there are two kinds of attention masks to specify:
 
-        (1) Self-attention masks that need to be causal (only attends to
-        previous tokens);
-        (2) A cross-attention mask that prevents the module
-        from attending to the encoder's padding tokens.
+        .. _`Attention is all you need`:
+            https://arxiv.org/abs/1706.03762
 
-        [1] Vaswani, Ashish, et al. "Attention is all you need." Advances in
-        neural information processing systems. 2017.
         """
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
@@ -791,11 +791,16 @@ class BertForMaskedLM(BertPreTrainedModel):
             Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
+        **ltr_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the left-to-right language modeling loss (next word prediction).
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
 
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **masked_lm_loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
             Masked language modeling loss.
-        **next_token_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+        **ltr_lm_loss**: (`optional`, returned when ``ltr_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
             Next token prediction loss.
         **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -833,7 +838,7 @@ class BertForMaskedLM(BertPreTrainedModel):
                                    self.bert.embeddings.word_embeddings)
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
-                masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ):
+                masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, ltr_lm_labels=None, ):
 
         outputs = self.bert(input_ids,
                             attention_mask=attention_mask,
@@ -852,22 +857,22 @@ class BertForMaskedLM(BertPreTrainedModel):
         # 1. If a tensor that contains the indices of masked labels is provided,
         #    the cross-entropy is the MLM cross-entropy that measures the likelihood
         #    of predictions for masked words.
-        # 2. If `lm_label` is provided we are in a causal scenario where we
-        #    try to predict the next word for each input in the encoder.
+        # 2. If `ltr_lm_labels` is provided we are in a causal scenario where we
+        #    try to predict the next token for each input in the decoder.
         if masked_lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)  # -1 index = padding token
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
             outputs = (masked_lm_loss,) + outputs
 
-        if lm_labels is not None:
+        if ltr_lm_labels is not None:
             # we are doing next-token prediction; shift prediction scores and input ids by one
             prediction_scores = prediction_scores[:, :-1, :].contiguous()
-            lm_labels = lm_labels[:, 1:].contiguous()
+            ltr_lm_labels = ltr_lm_labels[:, 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            next_token_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1))
-            outputs = (next_token_loss,) + outputs
+            ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), ltr_lm_labels.view(-1))
+            outputs = (ltr_lm_loss,) + outputs
 
-        return outputs  # (masked_lm_loss), (next_token_loss), prediction_scores, (hidden_states), (attentions)
+        return outputs  # (masked_lm_loss), (ltr_lm_loss), prediction_scores, (hidden_states), (attentions)
 
 
 @add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,