From 418589244d263087f1d48655f621a65f2a5fcba6 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 10 Dec 2019 15:26:19 -0500 Subject: [PATCH 1/5] Uniforming the ignored indices --- templates/adding_a_new_model/modeling_xxx.py | 4 ++-- transformers/modeling_albert.py | 4 ++-- transformers/modeling_bert.py | 14 +++++++------- transformers/modeling_camembert.py | 2 +- transformers/modeling_ctrl.py | 4 ++-- transformers/modeling_distilbert.py | 4 ++-- transformers/modeling_gpt2.py | 8 ++++---- transformers/modeling_openai.py | 8 ++++---- transformers/modeling_roberta.py | 4 ++-- transformers/modeling_tf_roberta.py | 6 ------ transformers/modeling_transfo_xl.py | 10 +++++----- transformers/modeling_xlm.py | 2 +- transformers/modeling_xlnet.py | 4 ++-- 13 files changed, 34 insertions(+), 40 deletions(-) diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py index 94c4b0db9a..2758ade571 100644 --- a/templates/adding_a_new_model/modeling_xxx.py +++ b/templates/adding_a_new_model/modeling_xxx.py @@ -362,7 +362,7 @@ class XxxForMaskedLM(XxxPreTrainedModel): **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for computing the masked language modeling loss. Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -413,7 +413,7 @@ class XxxForMaskedLM(XxxPreTrainedModel): outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here if masked_lm_labels is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs diff --git a/transformers/modeling_albert.py b/transformers/modeling_albert.py index 0f67bf8f36..f833b6d6bf 100644 --- a/transformers/modeling_albert.py +++ b/transformers/modeling_albert.py @@ -572,7 +572,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for computing the masked language modeling loss. Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -624,7 +624,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here if masked_lm_labels is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index d84b0a1a7c..1c142fcd28 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -748,7 +748,7 @@ class BertForPreTraining(BertPreTrainedModel): **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for computing the masked language modeling loss. Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring) @@ -807,7 +807,7 @@ class BertForPreTraining(BertPreTrainedModel): outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here if masked_lm_labels is not None and next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss @@ -824,12 +824,12 @@ class BertForMaskedLM(BertPreTrainedModel): **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for computing the masked language modeling loss. Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -891,7 +891,7 @@ class BertForMaskedLM(BertPreTrainedModel): # 2. If `lm_labels` is provided we are in a causal scenario where we # try to predict the next token for each input in the decoder. if masked_lm_labels is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) # -1 index = padding token + loss_fct = CrossEntropyLoss() # -1 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs @@ -899,7 +899,7 @@ class BertForMaskedLM(BertPreTrainedModel): # we are doing next-token prediction; shift prediction scores and input ids by one prediction_scores = prediction_scores[:, :-1, :].contiguous() lm_labels = lm_labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss() ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1)) outputs = (ltr_lm_loss,) + outputs @@ -963,7 +963,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel): outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here if next_sentence_label is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss() next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) outputs = (next_sentence_loss,) + outputs diff --git a/transformers/modeling_camembert.py b/transformers/modeling_camembert.py index f302346f2d..1b808bfd82 100644 --- a/transformers/modeling_camembert.py +++ b/transformers/modeling_camembert.py @@ -156,7 +156,7 @@ class CamembertForMaskedLM(RobertaForMaskedLM): **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for computing the masked language modeling loss. Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: diff --git a/transformers/modeling_ctrl.py b/transformers/modeling_ctrl.py index 97bcb14434..f9dc2aba73 100644 --- a/transformers/modeling_ctrl.py +++ b/transformers/modeling_ctrl.py @@ -429,7 +429,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` - All labels set to ``-1`` are ignored (masked), the loss is only + All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -494,7 +494,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens - loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py index 6faeafa15e..5a4d55d3b1 100644 --- a/transformers/modeling_distilbert.py +++ b/transformers/modeling_distilbert.py @@ -491,7 +491,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for computing the masked language modeling loss. Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -528,7 +528,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): self.init_weights() - self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1) + self.mlm_loss_fct = nn.CrossEntropyLoss() def get_output_embeddings(self): return self.vocab_projector diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py index 96fd1c0607..a4a6c89998 100644 --- a/transformers/modeling_gpt2.py +++ b/transformers/modeling_gpt2.py @@ -494,7 +494,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` - All labels set to ``-1`` are ignored (masked), the loss is only + All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -557,7 +557,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens - loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs @@ -579,7 +579,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` - All labels set to ``-1`` are ignored (masked), the loss is only + All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``: Labels for computing the multiple choice classification loss. @@ -667,7 +667,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): if lm_labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() - loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs diff --git a/transformers/modeling_openai.py b/transformers/modeling_openai.py index 4fe7ffee8b..f980ad7e8d 100644 --- a/transformers/modeling_openai.py +++ b/transformers/modeling_openai.py @@ -471,7 +471,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` - All labels set to ``-1`` are ignored (masked), the loss is only + All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -523,7 +523,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = labels[..., 1:].contiguous() # Flatten the tokens - loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs @@ -545,7 +545,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` - All labels set to ``-1`` are ignored (masked), the loss is only + All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``: Labels for computing the multiple choice classification loss. @@ -621,7 +621,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): if lm_labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() - loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index fc27353d37..8fac453ecd 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -196,7 +196,7 @@ class RobertaForMaskedLM(BertPreTrainedModel): **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for computing the masked language modeling loss. Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -250,7 +250,7 @@ class RobertaForMaskedLM(BertPreTrainedModel): outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here if masked_lm_labels is not None: - loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss() masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py index 954279f873..1ed8f330eb 100644 --- a/transformers/modeling_tf_roberta.py +++ b/transformers/modeling_tf_roberta.py @@ -250,12 +250,6 @@ class TFRobertaLMHead(tf.keras.layers.Layer): ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): r""" - **masked_lm_labels**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: - Labels for computing the masked language modeling loss. - Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``tf.Tensor`` of shape ``(1,)``: Masked language modeling loss. diff --git a/transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py index a6a82f0dfe..b9271805e0 100644 --- a/transformers/modeling_transfo_xl.py +++ b/transformers/modeling_transfo_xl.py @@ -796,17 +796,17 @@ class TransfoXLModel(TransfoXLPreTrainedModel): TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING) class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): r""" - **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for language modeling. - Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` - All labels set to ``-1`` are ignored (masked), the loss is only + All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Language modeling loss. - **prediction_scores**: ``None`` if ``lm_labels`` is provided else ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + **prediction_scores**: ``None`` if ``labels`` is provided else ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). We don't output them when the loss is computed to speedup adaptive softmax decoding. **mems**: diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py index 257f0da394..77027aab04 100644 --- a/transformers/modeling_xlm.py +++ b/transformers/modeling_xlm.py @@ -604,7 +604,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` - All labels set to ``-1`` are ignored (masked), the loss is only + All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py index 225e5b059b..0a8b9f0af3 100644 --- a/transformers/modeling_xlnet.py +++ b/transformers/modeling_xlnet.py @@ -898,7 +898,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` - All labels set to ``-1`` are ignored (masked), the loss is only + All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: @@ -965,7 +965,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): if labels is not None: # Flatten the tokens - loss_fct = CrossEntropyLoss(ignore_index=-1) + loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1)) outputs = (loss,) + outputs From ec6fb25c21d0c9248f5ef6ce986426e124cd3da6 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 10 Dec 2019 15:49:20 -0500 Subject: [PATCH 2/5] Patch documentation --- transformers/modeling_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index 1c142fcd28..549bc5950b 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -891,7 +891,7 @@ class BertForMaskedLM(BertPreTrainedModel): # 2. If `lm_labels` is provided we are in a causal scenario where we # try to predict the next token for each input in the decoder. if masked_lm_labels is not None: - loss_fct = CrossEntropyLoss() # -1 index = padding token + loss_fct = CrossEntropyLoss() # -100 index = padding token masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) outputs = (masked_lm_loss,) + outputs From b72f9d340e9c57591fa655e31d93dee3d11270c7 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 10 Dec 2019 18:33:17 -0500 Subject: [PATCH 3/5] Correct index in script --- examples/run_lm_finetuning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index c4c73e71af..c35b6e02a8 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -150,7 +150,7 @@ def mask_tokens(inputs, tokenizer, args): special_tokens_mask = [tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()] probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0) masked_indices = torch.bernoulli(probability_matrix).bool() - labels[~masked_indices] = -1 # We only compute loss on masked tokens + labels[~masked_indices] = -100 # We only compute loss on masked tokens # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices From 3fd71c4431f2b31eaad737d364e4a4d9bf35fd5b Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 12 Dec 2019 12:08:54 -0500 Subject: [PATCH 4/5] Update example scripts --- examples/distillation/distiller.py | 6 +++--- examples/utils_ner.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index 1e33190aca..7658fe4acd 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -112,7 +112,7 @@ class Distiller: self.last_log = 0 self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean') - self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1) + self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100) if self.alpha_mse > 0.: self.mse_loss_fct = nn.MSELoss(reduction='sum') if self.alpha_cos > 0.: @@ -224,7 +224,7 @@ class Distiller: _token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long() token_ids = token_ids.masked_scatter(pred_mask, _token_ids) - mlm_labels[~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility + mlm_labels[~pred_mask] = -100 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility # sanity checks assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size @@ -254,7 +254,7 @@ class Distiller: attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]) clm_labels = token_ids.new(token_ids.size()).copy_(token_ids) - clm_labels[~attn_mask] = -1 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility + clm_labels[~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility # sanity checks assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size diff --git a/examples/utils_ner.py b/examples/utils_ner.py index c20d7b0d1f..45ddeafbd5 100644 --- a/examples/utils_ner.py +++ b/examples/utils_ner.py @@ -94,7 +94,7 @@ def convert_examples_to_features(examples, pad_on_left=False, pad_token=0, pad_token_segment_id=0, - pad_token_label_id=-1, + pad_token_label_id=-100, sequence_a_segment_id=0, mask_padding_with_zero=True): """ Loads a data file into a list of `InputBatch`s From dc667ce1a7b6d6e2026d39c4a3bc3c25b395e0d6 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Sat, 14 Dec 2019 09:56:27 +0100 Subject: [PATCH 5/5] double check cc @LysandreJik --- examples/contrib/run_openai_gpt.py | 2 +- examples/distillation/distiller.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/contrib/run_openai_gpt.py b/examples/contrib/run_openai_gpt.py index 2d165a91e3..bc5695becd 100644 --- a/examples/contrib/run_openai_gpt.py +++ b/examples/contrib/run_openai_gpt.py @@ -75,7 +75,7 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d n_batch = len(dataset) input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64) mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64) - lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64) + lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64) mc_labels = np.zeros((n_batch,), dtype=np.int64) for i, (story, cont1, cont2, mc_label), in enumerate(dataset): with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token] diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py index 7658fe4acd..d5a86247a8 100644 --- a/examples/distillation/distiller.py +++ b/examples/distillation/distiller.py @@ -186,7 +186,7 @@ class Distiller: ------- token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM. attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention. - mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -1 where there is nothing to predict. + mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -100 where there is nothing to predict. """ token_ids, lengths = batch token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths) @@ -246,7 +246,7 @@ class Distiller: ------- token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM. attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention. - clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -1 where there is nothing to predict. + clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -100 where there is nothing to predict. """ token_ids, lengths = batch token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)