From dc667ce1a7b6d6e2026d39c4a3bc3c25b395e0d6 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 14 Dec 2019 09:56:27 +0100
Subject: [PATCH] double check cc @LysandreJik

---
 examples/contrib/run_openai_gpt.py | 2 +-
 examples/distillation/distiller.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/contrib/run_openai_gpt.py b/examples/contrib/run_openai_gpt.py
index 2d165a91e3..bc5695becd 100644
--- a/examples/contrib/run_openai_gpt.py
+++ b/examples/contrib/run_openai_gpt.py
@@ -75,7 +75,7 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
         n_batch = len(dataset)
         input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
         mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
-        lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
+        lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
         mc_labels = np.zeros((n_batch,), dtype=np.int64)
         for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
             with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py
index 7658fe4acd..d5a86247a8 100644
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -186,7 +186,7 @@ class Distiller:
         -------
             token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
             attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-            mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -1 where there is nothing to predict.
+            mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -100 where there is nothing to predict.
         """
         token_ids, lengths = batch
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
@@ -246,7 +246,7 @@ class Distiller:
         -------
             token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
             attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-            clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -1 where there is nothing to predict.
+            clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -100 where there is nothing to predict.
         """
         token_ids, lengths = batch
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)