Unify label args (#4722)

* Deprecate masked_lm_labels argument

* Apply to all models

* Better error message
This commit is contained in:
Sylvain Gugger
2020-06-03 09:36:26 -04:00
committed by GitHub
parent 3e5928c57d
commit 1b5820a565
14 changed files with 223 additions and 93 deletions

View File

@@ -17,6 +17,7 @@
import logging import logging
import math import math
import os import os
import warnings
import torch import torch
import torch.nn as nn import torch.nn as nn
@@ -599,11 +600,12 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
position_ids=None, position_ids=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
masked_lm_labels=None, labels=None,
sentence_order_label=None, sentence_order_label=None,
**kwargs
): ):
r""" r"""
masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
@@ -613,10 +615,12 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
Indices should be in ``[0, 1]``. Indices should be in ``[0, 1]``.
``0`` indicates original order (sequence A, then sequence B), ``0`` indicates original order (sequence A, then sequence B),
``1`` indicates switched order (sequence B, then sequence A). ``1`` indicates switched order (sequence B, then sequence A).
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -651,6 +655,14 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
""" """
if "masked_lm_labels" in kwargs:
warnings.warn(
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
DeprecationWarning,
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
outputs = self.albert( outputs = self.albert(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
@@ -667,9 +679,9 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
outputs = (prediction_scores, sop_scores,) + outputs[2:] # add hidden states and attention if they are here outputs = (prediction_scores, sop_scores,) + outputs[2:] # add hidden states and attention if they are here
if masked_lm_labels is not None and sentence_order_label is not None: if labels is not None and sentence_order_label is not None:
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1)) sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1))
total_loss = masked_lm_loss + sentence_order_loss total_loss = masked_lm_loss + sentence_order_loss
outputs = (total_loss,) + outputs outputs = (total_loss,) + outputs
@@ -742,18 +754,21 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
position_ids=None, position_ids=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
masked_lm_labels=None, labels=None,
**kwargs
): ):
r""" r"""
masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with
labels in ``[0, ..., config.vocab_size]`` labels in ``[0, ..., config.vocab_size]``
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Masked language modeling loss. Masked language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -777,10 +792,18 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForMaskedLM.from_pretrained('albert-base-v2') model = AlbertForMaskedLM.from_pretrained('albert-base-v2')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids) outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2] loss, prediction_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs:
warnings.warn(
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
DeprecationWarning,
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
outputs = self.albert( outputs = self.albert(
input_ids=input_ids, input_ids=input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
@@ -794,9 +817,9 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
prediction_scores = self.predictions(sequence_outputs) prediction_scores = self.predictions(sequence_outputs)
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
if masked_lm_labels is not None: if labels is not None:
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
outputs = (masked_lm_loss,) + outputs outputs = (masked_lm_loss,) + outputs
return outputs return outputs

View File

@@ -16,6 +16,7 @@
import logging import logging
import math import math
import random import random
import warnings
from typing import Dict, List, Optional, Tuple from typing import Dict, List, Optional, Tuple
import numpy as np import numpy as np
@@ -900,12 +901,12 @@ class BartForConditionalGeneration(PretrainedBartModel):
decoder_input_ids=None, decoder_input_ids=None,
decoder_attention_mask=None, decoder_attention_mask=None,
decoder_cached_states=None, decoder_cached_states=None,
lm_labels=None, labels=None,
use_cache=False, use_cache=False,
**unused **unused
): ):
r""" r"""
lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss.
Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring). Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
@@ -914,7 +915,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
masked_lm_loss (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Masked language modeling loss. Masked language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -945,6 +946,13 @@ class BartForConditionalGeneration(PretrainedBartModel):
tokenizer.decode(predictions).split() tokenizer.decode(predictions).split()
# ['good', 'great', 'all', 'really', 'very'] # ['good', 'great', 'all', 'really', 'very']
""" """
if "lm_labels" in unused:
warnings.warn(
"The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
DeprecationWarning,
)
labels = unused.pop("lm_labels")
outputs = self.model( outputs = self.model(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
@@ -956,10 +964,10 @@ class BartForConditionalGeneration(PretrainedBartModel):
) )
lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias) lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias)
outputs = (lm_logits,) + outputs[1:] # Add cache, hidden states and attention if they are here outputs = (lm_logits,) + outputs[1:] # Add cache, hidden states and attention if they are here
if lm_labels is not None: if labels is not None:
loss_fct = nn.CrossEntropyLoss() loss_fct = nn.CrossEntropyLoss()
# TODO(SS): do we need to ignore pad tokens in lm_labels? # TODO(SS): do we need to ignore pad tokens in labels?
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), lm_labels.view(-1)) masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
outputs = (masked_lm_loss,) + outputs outputs = (masked_lm_loss,) + outputs
return outputs return outputs

View File

@@ -19,6 +19,7 @@
import logging import logging
import math import math
import os import os
import warnings
import torch import torch
from torch import nn from torch import nn
@@ -768,11 +769,12 @@ class BertForPreTraining(BertPreTrainedModel):
position_ids=None, position_ids=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
masked_lm_labels=None, labels=None,
next_sentence_label=None, next_sentence_label=None,
**kwargs
): ):
r""" r"""
masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
@@ -782,10 +784,12 @@ class BertForPreTraining(BertPreTrainedModel):
Indices should be in ``[0, 1]``. Indices should be in ``[0, 1]``.
``0`` indicates sequence B is a continuation of sequence A, ``0`` indicates sequence B is a continuation of sequence A,
``1`` indicates sequence B is a random sequence. ``1`` indicates sequence B is a random sequence.
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -819,6 +823,13 @@ class BertForPreTraining(BertPreTrainedModel):
prediction_scores, seq_relationship_scores = outputs[:2] prediction_scores, seq_relationship_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs:
warnings.warn(
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
DeprecationWarning,
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
outputs = self.bert( outputs = self.bert(
input_ids, input_ids,
@@ -836,9 +847,9 @@ class BertForPreTraining(BertPreTrainedModel):
2: 2:
] # add hidden states and attention if they are here ] # add hidden states and attention if they are here
if masked_lm_labels is not None and next_sentence_label is not None: if labels is not None and next_sentence_label is not None:
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
total_loss = masked_lm_loss + next_sentence_loss total_loss = masked_lm_loss + next_sentence_loss
outputs = (total_loss,) + outputs outputs = (total_loss,) + outputs
@@ -846,6 +857,7 @@ class BertForPreTraining(BertPreTrainedModel):
return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
# TODO: Split with a different BertWithLMHead to get rid of `lm_labels` here and in encoder_decoder.
@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
class BertForMaskedLM(BertPreTrainedModel): class BertForMaskedLM(BertPreTrainedModel):
def __init__(self, config): def __init__(self, config):
@@ -868,13 +880,14 @@ class BertForMaskedLM(BertPreTrainedModel):
position_ids=None, position_ids=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
masked_lm_labels=None, labels=None,
encoder_hidden_states=None, encoder_hidden_states=None,
encoder_attention_mask=None, encoder_attention_mask=None,
lm_labels=None, lm_labels=None,
**kwargs
): ):
r""" r"""
masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
@@ -884,10 +897,12 @@ class BertForMaskedLM(BertPreTrainedModel):
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]`` in ``[0, ..., config.vocab_size]``
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Masked language modeling loss. Masked language modeling loss.
ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided): ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided):
Next token prediction loss. Next token prediction loss.
@@ -914,11 +929,18 @@ class BertForMaskedLM(BertPreTrainedModel):
model = BertForMaskedLM.from_pretrained('bert-base-uncased') model = BertForMaskedLM.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids) outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2] loss, prediction_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs:
warnings.warn(
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
DeprecationWarning,
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
outputs = self.bert( outputs = self.bert(
input_ids, input_ids,
@@ -942,9 +964,9 @@ class BertForMaskedLM(BertPreTrainedModel):
# of predictions for masked words. # of predictions for masked words.
# 2. If `lm_labels` is provided we are in a causal scenario where we # 2. If `lm_labels` is provided we are in a causal scenario where we
# try to predict the next token for each input in the decoder. # try to predict the next token for each input in the decoder.
if masked_lm_labels is not None: if labels is not None:
loss_fct = CrossEntropyLoss() # -100 index = padding token loss_fct = CrossEntropyLoss() # -100 index = padding token
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
outputs = (masked_lm_loss,) + outputs outputs = (masked_lm_loss,) + outputs
if lm_labels is not None: if lm_labels is not None:

View File

@@ -489,7 +489,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
r""" r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for language modeling. Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
Indices are selected in ``[-100, 0, ..., config.vocab_size]`` Indices are selected in ``[-100, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]`` computed for labels in ``[0, ..., config.vocab_size]``

View File

@@ -21,6 +21,7 @@
import copy import copy
import logging import logging
import math import math
import warnings
import numpy as np import numpy as np
import torch import torch
@@ -493,17 +494,19 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
return self.vocab_projector return self.vocab_projector
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None): def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, **kwargs):
r""" r"""
masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]`` in ``[0, ..., config.vocab_size]``
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Masked language modeling loss. Masked language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -527,10 +530,18 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased') model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids) outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2] loss, prediction_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs:
warnings.warn(
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
DeprecationWarning,
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
dlbrt_output = self.distilbert( dlbrt_output = self.distilbert(
input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
) )
@@ -541,10 +552,8 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size)
outputs = (prediction_logits,) + dlbrt_output[1:] outputs = (prediction_logits,) + dlbrt_output[1:]
if masked_lm_labels is not None: if labels is not None:
mlm_loss = self.mlm_loss_fct( mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1))
prediction_logits.view(-1, prediction_logits.size(-1)), masked_lm_labels.view(-1)
)
outputs = (mlm_loss,) + outputs outputs = (mlm_loss,) + outputs
return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions) return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions)

View File

@@ -1,5 +1,6 @@
import logging import logging
import os import os
import warnings
import torch import torch
import torch.nn as nn import torch.nn as nn
@@ -561,18 +562,21 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
position_ids=None, position_ids=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
masked_lm_labels=None, labels=None,
**kwargs
): ):
r""" r"""
masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]`` in ``[0, ..., config.vocab_size]``
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Masked language modeling loss. Masked language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -597,11 +601,18 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator') model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids) outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2] loss, prediction_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs:
warnings.warn(
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
DeprecationWarning,
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
generator_hidden_states = self.electra( generator_hidden_states = self.electra(
input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds
@@ -614,9 +625,9 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
output = (prediction_scores,) output = (prediction_scores,)
# Masked language modeling softmax layer # Masked language modeling softmax layer
if masked_lm_labels is not None: if labels is not None:
loss_fct = nn.CrossEntropyLoss() # -100 index = padding token loss_fct = nn.CrossEntropyLoss() # -100 index = padding token
loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
output = (loss,) + output output = (loss,) + output
output += generator_hidden_states[1:] output += generator_hidden_states[1:]

View File

@@ -191,7 +191,7 @@ class EncoderDecoderModel(PreTrainedModel):
decoder_attention_mask=None, decoder_attention_mask=None,
decoder_head_mask=None, decoder_head_mask=None,
decoder_inputs_embeds=None, decoder_inputs_embeds=None,
masked_lm_labels=None, labels=None,
lm_labels=None, lm_labels=None,
**kwargs, **kwargs,
): ):
@@ -234,7 +234,7 @@ class EncoderDecoderModel(PreTrainedModel):
Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded representation. Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors
than the model's internal embedding lookup matrix. than the model's internal embedding lookup matrix.
masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss for the decoder. Labels for computing the masked language modeling loss for the decoder.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
@@ -294,7 +294,7 @@ class EncoderDecoderModel(PreTrainedModel):
encoder_attention_mask=attention_mask, encoder_attention_mask=attention_mask,
head_mask=decoder_head_mask, head_mask=decoder_head_mask,
lm_labels=lm_labels, lm_labels=lm_labels,
masked_lm_labels=masked_lm_labels, labels=labels,
**kwargs_decoder, **kwargs_decoder,
) )

View File

@@ -18,6 +18,7 @@
import logging import logging
import os import os
import warnings
import torch import torch
import torch.nn as nn import torch.nn as nn
@@ -652,17 +653,18 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
mc_token_ids=None, mc_token_ids=None,
lm_labels=None, labels=None,
mc_labels=None, mc_labels=None,
use_cache=True, use_cache=True,
**kwargs
): ):
r""" r"""
mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input) mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
Index of the classification token in each input sequence. Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1[``. Selected in the range ``[0, input_ids.size(-1) - 1[``.
lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`) labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
Labels for language modeling. Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
Indices are selected in ``[-1, 0, ..., config.vocab_size]`` Indices are selected in ``[-1, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]`` computed for labels in ``[0, ..., config.vocab_size]``
@@ -670,12 +672,14 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Return: Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided): lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
Language modeling loss. Language modeling loss.
mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`multiple_choice_labels` is provided): mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided):
Multiple choice classification loss. Multiple choice classification loss.
lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -720,6 +724,14 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
lm_prediction_scores, mc_prediction_scores = outputs[:2] lm_prediction_scores, mc_prediction_scores = outputs[:2]
""" """
if "lm_labels" in kwargs:
warnings.warn(
"The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
DeprecationWarning,
)
labels = kwargs.pop("lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
past=past, past=past,
@@ -741,9 +753,9 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
outputs = (loss,) + outputs outputs = (loss,) + outputs
if lm_labels is not None: if labels is not None:
shift_logits = lm_logits[..., :-1, :].contiguous() shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous() shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
outputs = (loss,) + outputs outputs = (loss,) + outputs

View File

@@ -16,6 +16,7 @@
import logging import logging
import math import math
import warnings
import torch import torch
import torch.nn as nn import torch.nn as nn
@@ -587,14 +588,11 @@ class LongformerModel(RobertaModel):
token_type_ids=None, token_type_ids=None,
position_ids=None, position_ids=None,
inputs_embeds=None, inputs_embeds=None,
masked_lm_labels=None,
): ):
r""" r"""
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Masked language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
@@ -704,18 +702,21 @@ class LongformerForMaskedLM(BertPreTrainedModel):
token_type_ids=None, token_type_ids=None,
position_ids=None, position_ids=None,
inputs_embeds=None, inputs_embeds=None,
masked_lm_labels=None, labels=None,
**kwargs
): ):
r""" r"""
masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]`` in ``[0, ..., config.vocab_size]``
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Masked language modeling loss. Masked language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -744,9 +745,17 @@ class LongformerForMaskedLM(BertPreTrainedModel):
attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM
# check ``LongformerModel.forward`` for more details how to set `attention_mask` # check ``LongformerModel.forward`` for more details how to set `attention_mask`
loss, prediction_scores = model(input_ids, attention_mask=attention_mask, masked_lm_labels=input_ids) loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids)
""" """
if "masked_lm_labels" in kwargs:
warnings.warn(
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
DeprecationWarning,
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
outputs = self.longformer( outputs = self.longformer(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
@@ -760,9 +769,9 @@ class LongformerForMaskedLM(BertPreTrainedModel):
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
if masked_lm_labels is not None: if labels is not None:
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
outputs = (masked_lm_loss,) + outputs outputs = (masked_lm_loss,) + outputs
return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)

View File

@@ -20,6 +20,7 @@ import json
import logging import logging
import math import math
import os import os
import warnings
import torch import torch
import torch.nn as nn import torch.nn as nn
@@ -588,16 +589,17 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
mc_token_ids=None, mc_token_ids=None,
lm_labels=None, labels=None,
mc_labels=None, mc_labels=None,
**kwargs
): ):
r""" r"""
mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input) mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
Index of the classification token in each input sequence. Index of the classification token in each input sequence.
Selected in the range ``[0, input_ids.size(-1) - 1[``. Selected in the range ``[0, input_ids.size(-1) - 1[``.
lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`) labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
Labels for language modeling. Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
Indices are selected in ``[-1, 0, ..., config.vocab_size]`` Indices are selected in ``[-1, 0, ..., config.vocab_size]``
All labels set to ``-100`` are ignored (masked), the loss is only All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]`` computed for labels in ``[0, ..., config.vocab_size]``
@@ -605,12 +607,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
Labels for computing the multiple choice classification loss. Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above) of the input tensors. (see `input_ids` above)
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Return: Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided): lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
Language modeling loss. Language modeling loss.
mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`multiple_choice_labels` is provided): mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided):
Multiple choice classification loss. Multiple choice classification loss.
lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -650,6 +654,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
lm_prediction_scores, mc_prediction_scores = outputs[:2] lm_prediction_scores, mc_prediction_scores = outputs[:2]
""" """
if "lm_labels" in kwargs:
warnings.warn(
"The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
DeprecationWarning,
)
labels = kwargs.pop("lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
transformer_outputs = self.transformer( transformer_outputs = self.transformer(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
@@ -668,9 +680,9 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
outputs = (loss,) + outputs outputs = (loss,) + outputs
if lm_labels is not None: if labels is not None:
shift_logits = lm_logits[..., :-1, :].contiguous() shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous() shift_labels = labels[..., 1:].contiguous()
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
outputs = (loss,) + outputs outputs = (loss,) + outputs

View File

@@ -1755,7 +1755,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
Return: Return:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided): loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss (cross entropy). Classification loss (cross entropy).
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).

View File

@@ -17,6 +17,7 @@
import logging import logging
import warnings
import torch import torch
import torch.nn as nn import torch.nn as nn
@@ -183,18 +184,21 @@ class RobertaForMaskedLM(BertPreTrainedModel):
position_ids=None, position_ids=None,
head_mask=None, head_mask=None,
inputs_embeds=None, inputs_embeds=None,
masked_lm_labels=None, labels=None,
**kwargs
): ):
r""" r"""
masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
Labels for computing the masked language modeling loss. Labels for computing the masked language modeling loss.
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
in ``[0, ..., config.vocab_size]`` in ``[0, ..., config.vocab_size]``
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Masked language modeling loss. Masked language modeling loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -218,10 +222,18 @@ class RobertaForMaskedLM(BertPreTrainedModel):
tokenizer = RobertaTokenizer.from_pretrained('roberta-base') tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMaskedLM.from_pretrained('roberta-base') model = RobertaForMaskedLM.from_pretrained('roberta-base')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
outputs = model(input_ids, masked_lm_labels=input_ids) outputs = model(input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2] loss, prediction_scores = outputs[:2]
""" """
if "masked_lm_labels" in kwargs:
warnings.warn(
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
DeprecationWarning,
)
labels = kwargs.pop("masked_lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
outputs = self.roberta( outputs = self.roberta(
input_ids, input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
@@ -235,9 +247,9 @@ class RobertaForMaskedLM(BertPreTrainedModel):
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
if masked_lm_labels is not None: if labels is not None:
loss_fct = CrossEntropyLoss() loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
outputs = (masked_lm_loss,) + outputs outputs = (masked_lm_loss,) + outputs
return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)

View File

@@ -19,6 +19,7 @@ import copy
import logging import logging
import math import math
import os import os
import warnings
import torch import torch
import torch.nn.functional as F import torch.nn.functional as F
@@ -616,10 +617,10 @@ class T5PreTrainedModel(PreTrainedModel):
shifted_input_ids[..., 0] = decoder_start_token_id shifted_input_ids[..., 0] = decoder_start_token_id
assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
# replace possible -100 values in lm_labels by `pad_token_id` # replace possible -100 values in labels by `pad_token_id`
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
assert torch.all(shifted_input_ids >= 0).item(), "Verify that `lm_labels` has only positive values and -100" assert torch.all(shifted_input_ids >= 0).item(), "Verify that `labels` has only positive values and -100"
return shifted_input_ids return shifted_input_ids
@@ -1008,21 +1009,24 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
decoder_attention_mask=None, decoder_attention_mask=None,
decoder_past_key_value_states=None, decoder_past_key_value_states=None,
use_cache=True, use_cache=True,
lm_labels=None, labels=None,
inputs_embeds=None, inputs_embeds=None,
decoder_inputs_embeds=None, decoder_inputs_embeds=None,
head_mask=None, head_mask=None,
**kwargs
): ):
r""" r"""
lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
Labels for computing the sequence classification/regression loss. Labels for computing the sequence classification/regression loss.
Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`. Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`.
All labels set to ``-100`` are ignored (masked), the loss is only All labels set to ``-100`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]`` computed for labels in ``[0, ..., config.vocab_size]``
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
Returns: Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs. :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs.
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided): loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
Classification loss (cross entropy). Classification loss (cross entropy).
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -1047,7 +1051,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
tokenizer = T5Tokenizer.from_pretrained('t5-small') tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small') model = T5ForConditionalGeneration.from_pretrained('t5-small')
input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1 input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids) outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
loss, prediction_scores = outputs[:2] loss, prediction_scores = outputs[:2]
tokenizer = T5Tokenizer.from_pretrained('t5-small') tokenizer = T5Tokenizer.from_pretrained('t5-small')
@@ -1056,6 +1060,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
outputs = model.generate(input_ids) outputs = model.generate(input_ids)
""" """
if "lm_labels" in kwargs:
warnings.warn(
"The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
DeprecationWarning,
)
labels = kwargs.pop("lm_labels")
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
# Encode if needed (training, first prediction pass) # Encode if needed (training, first prediction pass)
if encoder_outputs is None: if encoder_outputs is None:
# Convert encoder inputs in embeddings if needed # Convert encoder inputs in embeddings if needed
@@ -1065,14 +1077,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
hidden_states = encoder_outputs[0] hidden_states = encoder_outputs[0]
if lm_labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
# get decoder inputs from shifting lm labels to the right # get decoder inputs from shifting lm labels to the right
decoder_input_ids = self._shift_right(lm_labels) decoder_input_ids = self._shift_right(labels)
# If decoding with past key value states, only the last tokens # If decoding with past key value states, only the last tokens
# should be given as an input # should be given as an input
if decoder_past_key_value_states is not None: if decoder_past_key_value_states is not None:
assert lm_labels is None, "Decoder should not use cached key value states when training." assert labels is None, "Decoder should not use cached key value states when training."
if decoder_input_ids is not None: if decoder_input_ids is not None:
decoder_input_ids = decoder_input_ids[:, -1:] decoder_input_ids = decoder_input_ids[:, -1:]
if decoder_inputs_embeds is not None: if decoder_inputs_embeds is not None:
@@ -1103,9 +1115,9 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
lm_logits = self.lm_head(sequence_output) lm_logits = self.lm_head(sequence_output)
decoder_outputs = (lm_logits,) + decoder_outputs[1:] # Add hidden states and attention if they are here decoder_outputs = (lm_logits,) + decoder_outputs[1:] # Add hidden states and attention if they are here
if lm_labels is not None: if labels is not None:
loss_fct = CrossEntropyLoss(ignore_index=-100) loss_fct = CrossEntropyLoss(ignore_index=-100)
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)) loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
# TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
decoder_outputs = (loss,) + decoder_outputs decoder_outputs = (loss,) + decoder_outputs

View File

@@ -71,7 +71,7 @@ class EncoderDecoderModelTest(unittest.TestCase):
"decoder_choice_labels": decoder_choice_labels, "decoder_choice_labels": decoder_choice_labels,
"encoder_hidden_states": encoder_hidden_states, "encoder_hidden_states": encoder_hidden_states,
"lm_labels": decoder_token_labels, "lm_labels": decoder_token_labels,
"masked_lm_labels": decoder_token_labels, "labels": decoder_token_labels,
} }
def create_and_check_bert_encoder_decoder_model( def create_and_check_bert_encoder_decoder_model(
@@ -224,7 +224,7 @@ class EncoderDecoderModelTest(unittest.TestCase):
def check_loss_output(self, loss): def check_loss_output(self, loss):
self.assertEqual(loss.size(), ()) self.assertEqual(loss.size(), ())
def create_and_check_bert_encoder_decoder_model_mlm_labels( def create_and_check_bert_encoder_decoder_model_labels(
self, self,
config, config,
input_ids, input_ids,
@@ -233,7 +233,7 @@ class EncoderDecoderModelTest(unittest.TestCase):
decoder_config, decoder_config,
decoder_input_ids, decoder_input_ids,
decoder_attention_mask, decoder_attention_mask,
masked_lm_labels, labels,
**kwargs **kwargs
): ):
encoder_model = BertModel(config) encoder_model = BertModel(config)
@@ -245,7 +245,7 @@ class EncoderDecoderModelTest(unittest.TestCase):
decoder_input_ids=decoder_input_ids, decoder_input_ids=decoder_input_ids,
attention_mask=attention_mask, attention_mask=attention_mask,
decoder_attention_mask=decoder_attention_mask, decoder_attention_mask=decoder_attention_mask,
masked_lm_labels=masked_lm_labels, labels=labels,
) )
mlm_loss = outputs_encoder_decoder[0] mlm_loss = outputs_encoder_decoder[0]
@@ -316,9 +316,9 @@ class EncoderDecoderModelTest(unittest.TestCase):
input_ids_dict = self.prepare_config_and_inputs_bert() input_ids_dict = self.prepare_config_and_inputs_bert()
self.create_and_check_save_and_load_encoder_decoder_model(**input_ids_dict) self.create_and_check_save_and_load_encoder_decoder_model(**input_ids_dict)
def test_bert_encoder_decoder_model_mlm_labels(self): def test_bert_encoder_decoder_model_labels(self):
input_ids_dict = self.prepare_config_and_inputs_bert() input_ids_dict = self.prepare_config_and_inputs_bert()
self.create_and_check_bert_encoder_decoder_model_mlm_labels(**input_ids_dict) self.create_and_check_bert_encoder_decoder_model_labels(**input_ids_dict)
def test_bert_encoder_decoder_model_lm_labels(self): def test_bert_encoder_decoder_model_lm_labels(self):
input_ids_dict = self.prepare_config_and_inputs_bert() input_ids_dict = self.prepare_config_and_inputs_bert()