From 569da80cedcf9022b869fbb398666c93b3219956 Mon Sep 17 00:00:00 2001
From: Romain Keramitas <r.keramitas@gmail.com>
Date: Tue, 7 Jan 2020 10:09:56 +0100
Subject: [PATCH] Make doc regarding masked indices clearer.

Signed-off-by: Romain Keramitas <r.keramitas@gmail.com>
---
 src/transformers/modeling_albert.py     | 2 +-
 src/transformers/modeling_bert.py       | 6 +++---
 src/transformers/modeling_camembert.py  | 2 +-
 src/transformers/modeling_ctrl.py       | 2 +-
 src/transformers/modeling_distilbert.py | 2 +-
 src/transformers/modeling_gpt2.py       | 2 +-
 src/transformers/modeling_openai.py     | 4 ++--
 src/transformers/modeling_roberta.py    | 2 +-
 8 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py
index dea35e695e..c1540bda5f 100644
--- a/src/transformers/modeling_albert.py
+++ b/src/transformers/modeling_albert.py
@@ -597,7 +597,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
 
diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py
index 49f67e4416..cdc46b9662 100644
--- a/src/transformers/modeling_bert.py
+++ b/src/transformers/modeling_bert.py
@@ -826,7 +826,7 @@ class BertForPreTraining(BertPreTrainedModel):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
         **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -916,12 +916,12 @@ class BertForMaskedLM(BertPreTrainedModel):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the left-to-right language modeling loss (next word prediction).
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
 
diff --git a/src/transformers/modeling_camembert.py b/src/transformers/modeling_camembert.py
index b16df3f9b9..720d4a1f60 100644
--- a/src/transformers/modeling_camembert.py
+++ b/src/transformers/modeling_camembert.py
@@ -167,7 +167,7 @@ class CamembertForMaskedLM(RobertaForMaskedLM):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
 
diff --git a/src/transformers/modeling_ctrl.py b/src/transformers/modeling_ctrl.py
index e2aebed8c8..03e73c3311 100644
--- a/src/transformers/modeling_ctrl.py
+++ b/src/transformers/modeling_ctrl.py
@@ -444,7 +444,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
             All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
 
diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py
index 0563dcf852..475992d382 100644
--- a/src/transformers/modeling_distilbert.py
+++ b/src/transformers/modeling_distilbert.py
@@ -496,7 +496,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
 
diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py
index 54db1cc067..679100b84d 100644
--- a/src/transformers/modeling_gpt2.py
+++ b/src/transformers/modeling_gpt2.py
@@ -513,7 +513,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
             All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
 
diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py
index d8c5abc92e..ec0f09d158 100644
--- a/src/transformers/modeling_openai.py
+++ b/src/transformers/modeling_openai.py
@@ -490,7 +490,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
             All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
 
@@ -578,7 +578,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
             All labels set to ``-100`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py
index 8958d30a22..56e983e01c 100644
--- a/src/transformers/modeling_roberta.py
+++ b/src/transformers/modeling_roberta.py
@@ -223,7 +223,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``