Uniforming the ignored indices
This commit is contained in:
@@ -362,7 +362,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
|
|||||||
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Labels for computing the masked language modeling loss.
|
Labels for computing the masked language modeling loss.
|
||||||
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||||
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
|
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||||
in ``[0, ..., config.vocab_size]``
|
in ``[0, ..., config.vocab_size]``
|
||||||
|
|
||||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
@@ -413,7 +413,7 @@ class XxxForMaskedLM(XxxPreTrainedModel):
|
|||||||
|
|
||||||
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
|
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
|
||||||
if masked_lm_labels is not None:
|
if masked_lm_labels is not None:
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss()
|
||||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||||
outputs = (masked_lm_loss,) + outputs
|
outputs = (masked_lm_loss,) + outputs
|
||||||
|
|
||||||
|
|||||||
@@ -572,7 +572,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
|
|||||||
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Labels for computing the masked language modeling loss.
|
Labels for computing the masked language modeling loss.
|
||||||
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||||
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
|
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||||
in ``[0, ..., config.vocab_size]``
|
in ``[0, ..., config.vocab_size]``
|
||||||
|
|
||||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
@@ -624,7 +624,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
|
|||||||
|
|
||||||
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
|
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
|
||||||
if masked_lm_labels is not None:
|
if masked_lm_labels is not None:
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss()
|
||||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||||
outputs = (masked_lm_loss,) + outputs
|
outputs = (masked_lm_loss,) + outputs
|
||||||
|
|
||||||
|
|||||||
@@ -748,7 +748,7 @@ class BertForPreTraining(BertPreTrainedModel):
|
|||||||
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Labels for computing the masked language modeling loss.
|
Labels for computing the masked language modeling loss.
|
||||||
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||||
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
|
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||||
in ``[0, ..., config.vocab_size]``
|
in ``[0, ..., config.vocab_size]``
|
||||||
**next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
**next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
||||||
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
|
Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
|
||||||
@@ -807,7 +807,7 @@ class BertForPreTraining(BertPreTrainedModel):
|
|||||||
outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here
|
outputs = (prediction_scores, seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here
|
||||||
|
|
||||||
if masked_lm_labels is not None and next_sentence_label is not None:
|
if masked_lm_labels is not None and next_sentence_label is not None:
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss()
|
||||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||||
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
|
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
|
||||||
total_loss = masked_lm_loss + next_sentence_loss
|
total_loss = masked_lm_loss + next_sentence_loss
|
||||||
@@ -824,12 +824,12 @@ class BertForMaskedLM(BertPreTrainedModel):
|
|||||||
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Labels for computing the masked language modeling loss.
|
Labels for computing the masked language modeling loss.
|
||||||
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||||
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
|
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||||
in ``[0, ..., config.vocab_size]``
|
in ``[0, ..., config.vocab_size]``
|
||||||
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Labels for computing the left-to-right language modeling loss (next word prediction).
|
Labels for computing the left-to-right language modeling loss (next word prediction).
|
||||||
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||||
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
|
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||||
in ``[0, ..., config.vocab_size]``
|
in ``[0, ..., config.vocab_size]``
|
||||||
|
|
||||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
@@ -891,7 +891,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
|||||||
# 2. If `lm_labels` is provided we are in a causal scenario where we
|
# 2. If `lm_labels` is provided we are in a causal scenario where we
|
||||||
# try to predict the next token for each input in the decoder.
|
# try to predict the next token for each input in the decoder.
|
||||||
if masked_lm_labels is not None:
|
if masked_lm_labels is not None:
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1) # -1 index = padding token
|
loss_fct = CrossEntropyLoss() # -100 index (the default ignore_index) = padding token
|
||||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||||
outputs = (masked_lm_loss,) + outputs
|
outputs = (masked_lm_loss,) + outputs
|
||||||
|
|
||||||
@@ -899,7 +899,7 @@ class BertForMaskedLM(BertPreTrainedModel):
|
|||||||
# we are doing next-token prediction; shift prediction scores and input ids by one
|
# we are doing next-token prediction; shift prediction scores and input ids by one
|
||||||
prediction_scores = prediction_scores[:, :-1, :].contiguous()
|
prediction_scores = prediction_scores[:, :-1, :].contiguous()
|
||||||
lm_labels = lm_labels[:, 1:].contiguous()
|
lm_labels = lm_labels[:, 1:].contiguous()
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss()
|
||||||
ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1))
|
ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1))
|
||||||
outputs = (ltr_lm_loss,) + outputs
|
outputs = (ltr_lm_loss,) + outputs
|
||||||
|
|
||||||
@@ -963,7 +963,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
|||||||
|
|
||||||
outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here
|
outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here
|
||||||
if next_sentence_label is not None:
|
if next_sentence_label is not None:
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss()
|
||||||
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
|
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
|
||||||
outputs = (next_sentence_loss,) + outputs
|
outputs = (next_sentence_loss,) + outputs
|
||||||
|
|
||||||
|
|||||||
@@ -156,7 +156,7 @@ class CamembertForMaskedLM(RobertaForMaskedLM):
|
|||||||
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Labels for computing the masked language modeling loss.
|
Labels for computing the masked language modeling loss.
|
||||||
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||||
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
|
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||||
in ``[0, ..., config.vocab_size]``
|
in ``[0, ..., config.vocab_size]``
|
||||||
|
|
||||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
|
|||||||
@@ -429,7 +429,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
|||||||
Labels for language modeling.
|
Labels for language modeling.
|
||||||
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
||||||
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
|
||||||
All labels set to ``-1`` are ignored (masked), the loss is only
|
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||||
computed for labels in ``[0, ..., config.vocab_size]``
|
computed for labels in ``[0, ..., config.vocab_size]``
|
||||||
|
|
||||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
@@ -494,7 +494,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
|||||||
shift_logits = lm_logits[..., :-1, :].contiguous()
|
shift_logits = lm_logits[..., :-1, :].contiguous()
|
||||||
shift_labels = labels[..., 1:].contiguous()
|
shift_labels = labels[..., 1:].contiguous()
|
||||||
# Flatten the tokens
|
# Flatten the tokens
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss()
|
||||||
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
||||||
shift_labels.view(-1))
|
shift_labels.view(-1))
|
||||||
outputs = (loss,) + outputs
|
outputs = (loss,) + outputs
|
||||||
|
|||||||
@@ -491,7 +491,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
|||||||
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Labels for computing the masked language modeling loss.
|
Labels for computing the masked language modeling loss.
|
||||||
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||||
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
|
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||||
in ``[0, ..., config.vocab_size]``
|
in ``[0, ..., config.vocab_size]``
|
||||||
|
|
||||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
@@ -528,7 +528,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
|
self.mlm_loss_fct = nn.CrossEntropyLoss()
|
||||||
|
|
||||||
def get_output_embeddings(self):
|
def get_output_embeddings(self):
|
||||||
return self.vocab_projector
|
return self.vocab_projector
|
||||||
|
|||||||
@@ -494,7 +494,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
Labels for language modeling.
|
Labels for language modeling.
|
||||||
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
||||||
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
|
||||||
All labels set to ``-1`` are ignored (masked), the loss is only
|
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||||
computed for labels in ``[0, ..., config.vocab_size]``
|
computed for labels in ``[0, ..., config.vocab_size]``
|
||||||
|
|
||||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
@@ -557,7 +557,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
shift_logits = lm_logits[..., :-1, :].contiguous()
|
shift_logits = lm_logits[..., :-1, :].contiguous()
|
||||||
shift_labels = labels[..., 1:].contiguous()
|
shift_labels = labels[..., 1:].contiguous()
|
||||||
# Flatten the tokens
|
# Flatten the tokens
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss()
|
||||||
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
||||||
shift_labels.view(-1))
|
shift_labels.view(-1))
|
||||||
outputs = (loss,) + outputs
|
outputs = (loss,) + outputs
|
||||||
@@ -579,7 +579,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
Labels for language modeling.
|
Labels for language modeling.
|
||||||
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
||||||
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
|
||||||
All labels set to ``-1`` are ignored (masked), the loss is only
|
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||||
computed for labels in ``[0, ..., config.vocab_size]``
|
computed for labels in ``[0, ..., config.vocab_size]``
|
||||||
**mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
|
**mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
|
||||||
Labels for computing the multiple choice classification loss.
|
Labels for computing the multiple choice classification loss.
|
||||||
@@ -667,7 +667,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
if lm_labels is not None:
|
if lm_labels is not None:
|
||||||
shift_logits = lm_logits[..., :-1, :].contiguous()
|
shift_logits = lm_logits[..., :-1, :].contiguous()
|
||||||
shift_labels = lm_labels[..., 1:].contiguous()
|
shift_labels = lm_labels[..., 1:].contiguous()
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss()
|
||||||
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
||||||
shift_labels.view(-1))
|
shift_labels.view(-1))
|
||||||
outputs = (loss,) + outputs
|
outputs = (loss,) + outputs
|
||||||
|
|||||||
@@ -471,7 +471,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
|||||||
Labels for language modeling.
|
Labels for language modeling.
|
||||||
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
|
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
|
||||||
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
|
||||||
All labels set to ``-1`` are ignored (masked), the loss is only
|
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||||
computed for labels in ``[0, ..., config.vocab_size]``
|
computed for labels in ``[0, ..., config.vocab_size]``
|
||||||
|
|
||||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
@@ -523,7 +523,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
|||||||
shift_logits = lm_logits[..., :-1, :].contiguous()
|
shift_logits = lm_logits[..., :-1, :].contiguous()
|
||||||
shift_labels = labels[..., 1:].contiguous()
|
shift_labels = labels[..., 1:].contiguous()
|
||||||
# Flatten the tokens
|
# Flatten the tokens
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss()
|
||||||
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
||||||
shift_labels.view(-1))
|
shift_labels.view(-1))
|
||||||
outputs = (loss,) + outputs
|
outputs = (loss,) + outputs
|
||||||
@@ -545,7 +545,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
Labels for language modeling.
|
Labels for language modeling.
|
||||||
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
||||||
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
|
||||||
All labels set to ``-1`` are ignored (masked), the loss is only
|
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||||
computed for labels in ``[0, ..., config.vocab_size]``
|
computed for labels in ``[0, ..., config.vocab_size]``
|
||||||
**mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
|
**mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
|
||||||
Labels for computing the multiple choice classification loss.
|
Labels for computing the multiple choice classification loss.
|
||||||
@@ -621,7 +621,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
if lm_labels is not None:
|
if lm_labels is not None:
|
||||||
shift_logits = lm_logits[..., :-1, :].contiguous()
|
shift_logits = lm_logits[..., :-1, :].contiguous()
|
||||||
shift_labels = lm_labels[..., 1:].contiguous()
|
shift_labels = lm_labels[..., 1:].contiguous()
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss()
|
||||||
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
|
||||||
shift_labels.view(-1))
|
shift_labels.view(-1))
|
||||||
outputs = (loss,) + outputs
|
outputs = (loss,) + outputs
|
||||||
|
|||||||
@@ -196,7 +196,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
|||||||
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Labels for computing the masked language modeling loss.
|
Labels for computing the masked language modeling loss.
|
||||||
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||||
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
|
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||||
in ``[0, ..., config.vocab_size]``
|
in ``[0, ..., config.vocab_size]``
|
||||||
|
|
||||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
@@ -250,7 +250,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
|||||||
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
|
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
|
||||||
|
|
||||||
if masked_lm_labels is not None:
|
if masked_lm_labels is not None:
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss()
|
||||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||||
outputs = (masked_lm_loss,) + outputs
|
outputs = (masked_lm_loss,) + outputs
|
||||||
|
|
||||||
|
|||||||
@@ -250,12 +250,6 @@ class TFRobertaLMHead(tf.keras.layers.Layer):
|
|||||||
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
|
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
|
||||||
class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
|
class TFRobertaForMaskedLM(TFRobertaPreTrainedModel):
|
||||||
r"""
|
r"""
|
||||||
**masked_lm_labels**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
|
|
||||||
Labels for computing the masked language modeling loss.
|
|
||||||
Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
|
||||||
Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
|
|
||||||
in ``[0, ..., config.vocab_size]``
|
|
||||||
|
|
||||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
**loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``tf.Tensor`` of shape ``(1,)``:
|
**loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``tf.Tensor`` of shape ``(1,)``:
|
||||||
Masked language modeling loss.
|
Masked language modeling loss.
|
||||||
|
|||||||
@@ -796,17 +796,17 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
|||||||
TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
|
TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
|
||||||
class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
||||||
r"""
|
r"""
|
||||||
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Labels for language modeling.
|
Labels for language modeling.
|
||||||
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
|
||||||
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
|
||||||
All labels set to ``-1`` are ignored (masked), the loss is only
|
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||||
computed for labels in ``[0, ..., config.vocab_size]``
|
computed for labels in ``[0, ..., config.vocab_size]``
|
||||||
|
|
||||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
**loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||||
Language modeling loss.
|
Language modeling loss.
|
||||||
**prediction_scores**: ``None`` if ``lm_labels`` is provided else ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
**prediction_scores**: ``None`` if ``labels`` is provided else ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
|
||||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||||
We don't output them when the loss is computed to speedup adaptive softmax decoding.
|
We don't output them when the loss is computed to speedup adaptive softmax decoding.
|
||||||
**mems**:
|
**mems**:
|
||||||
|
|||||||
@@ -604,7 +604,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
|||||||
Labels for language modeling.
|
Labels for language modeling.
|
||||||
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
||||||
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
|
||||||
All labels set to ``-1`` are ignored (masked), the loss is only
|
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||||
computed for labels in ``[0, ..., config.vocab_size]``
|
computed for labels in ``[0, ..., config.vocab_size]``
|
||||||
|
|
||||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
|
|||||||
@@ -898,7 +898,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
|||||||
Labels for language modeling.
|
Labels for language modeling.
|
||||||
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
||||||
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
|
||||||
All labels set to ``-1`` are ignored (masked), the loss is only
|
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||||
computed for labels in ``[0, ..., config.vocab_size]``
|
computed for labels in ``[0, ..., config.vocab_size]``
|
||||||
|
|
||||||
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
|
||||||
@@ -965,7 +965,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
|||||||
|
|
||||||
if labels is not None:
|
if labels is not None:
|
||||||
# Flatten the tokens
|
# Flatten the tokens
|
||||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
loss_fct = CrossEntropyLoss()
|
||||||
loss = loss_fct(logits.view(-1, logits.size(-1)),
|
loss = loss_fct(logits.view(-1, logits.size(-1)),
|
||||||
labels.view(-1))
|
labels.view(-1))
|
||||||
outputs = (loss,) + outputs
|
outputs = (loss,) + outputs
|
||||||
|
|||||||
Reference in New Issue
Block a user