Unify label args (#4722)
* Deprecate masked_lm_labels argument
* Apply to all models
* Better error message
This commit is contained in:
@@ -17,6 +17,7 @@
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@@ -599,11 +600,12 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
masked_lm_labels=None,
|
||||
labels=None,
|
||||
sentence_order_label=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
||||
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
@@ -613,10 +615,12 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
|
||||
Indices should be in ``[0, 1]``.
|
||||
``0`` indicates original order (sequence A, then sequence B),
|
||||
``1`` indicates switched order (sequence B, then sequence A).
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
||||
loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
|
||||
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
@@ -651,6 +655,14 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
|
||||
|
||||
"""
|
||||
|
||||
if "masked_lm_labels" in kwargs:
|
||||
warnings.warn(
|
||||
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
|
||||
outputs = self.albert(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
@@ -667,9 +679,9 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
|
||||
|
||||
outputs = (prediction_scores, sop_scores,) + outputs[2:] # add hidden states and attention if they are here
|
||||
|
||||
if masked_lm_labels is not None and sentence_order_label is not None:
|
||||
if labels is not None and sentence_order_label is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1))
|
||||
total_loss = masked_lm_loss + sentence_order_loss
|
||||
outputs = (total_loss,) + outputs
|
||||
@@ -742,18 +754,21 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
masked_lm_labels=None,
|
||||
labels=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with
|
||||
labels in ``[0, ..., config.vocab_size]``
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
|
||||
loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Masked language modeling loss.
|
||||
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
@@ -777,10 +792,18 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
|
||||
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
|
||||
model = AlbertForMaskedLM.from_pretrained('albert-base-v2')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, masked_lm_labels=input_ids)
|
||||
outputs = model(input_ids, labels=input_ids)
|
||||
loss, prediction_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
if "masked_lm_labels" in kwargs:
|
||||
warnings.warn(
|
||||
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
|
||||
outputs = self.albert(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
@@ -794,9 +817,9 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
|
||||
prediction_scores = self.predictions(sequence_outputs)
|
||||
|
||||
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
|
||||
if masked_lm_labels is not None:
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
outputs = (masked_lm_loss,) + outputs
|
||||
|
||||
return outputs
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
import logging
|
||||
import math
|
||||
import random
|
||||
import warnings
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
@@ -900,12 +901,12 @@ class BartForConditionalGeneration(PretrainedBartModel):
|
||||
decoder_input_ids=None,
|
||||
decoder_attention_mask=None,
|
||||
decoder_cached_states=None,
|
||||
lm_labels=None,
|
||||
labels=None,
|
||||
use_cache=False,
|
||||
**unused
|
||||
):
|
||||
r"""
|
||||
lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens
|
||||
@@ -914,7 +915,7 @@ class BartForConditionalGeneration(PretrainedBartModel):
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
|
||||
masked_lm_loss (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Masked language modeling loss.
|
||||
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
@@ -945,6 +946,13 @@ class BartForConditionalGeneration(PretrainedBartModel):
|
||||
tokenizer.decode(predictions).split()
|
||||
# ['good', 'great', 'all', 'really', 'very']
|
||||
"""
|
||||
if "lm_labels" in unused:
|
||||
warnings.warn(
|
||||
"The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
labels = unused.pop("lm_labels")
|
||||
|
||||
outputs = self.model(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
@@ -956,10 +964,10 @@ class BartForConditionalGeneration(PretrainedBartModel):
|
||||
)
|
||||
lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias)
|
||||
outputs = (lm_logits,) + outputs[1:] # Add cache, hidden states and attention if they are here
|
||||
if lm_labels is not None:
|
||||
if labels is not None:
|
||||
loss_fct = nn.CrossEntropyLoss()
|
||||
# TODO(SS): do we need to ignore pad tokens in lm_labels?
|
||||
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), lm_labels.view(-1))
|
||||
# TODO(SS): do we need to ignore pad tokens in labels?
|
||||
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
outputs = (masked_lm_loss,) + outputs
|
||||
|
||||
return outputs
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
@@ -768,11 +769,12 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
masked_lm_labels=None,
|
||||
labels=None,
|
||||
next_sentence_label=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
||||
labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
@@ -782,10 +784,12 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
Indices should be in ``[0, 1]``.
|
||||
``0`` indicates sequence B is a continuation of sequence A,
|
||||
``1`` indicates sequence B is a random sequence.
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
||||
loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
|
||||
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
@@ -819,6 +823,13 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
prediction_scores, seq_relationship_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
if "masked_lm_labels" in kwargs:
|
||||
warnings.warn(
|
||||
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
|
||||
outputs = self.bert(
|
||||
input_ids,
|
||||
@@ -836,9 +847,9 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
2:
|
||||
] # add hidden states and attention if they are here
|
||||
|
||||
if masked_lm_labels is not None and next_sentence_label is not None:
|
||||
if labels is not None and next_sentence_label is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
|
||||
total_loss = masked_lm_loss + next_sentence_loss
|
||||
outputs = (total_loss,) + outputs
|
||||
@@ -846,6 +857,7 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
|
||||
|
||||
|
||||
# TODO: Split with a different BertWithLMHead to get rid of `lm_labels` here and in encoder_decoder.
|
||||
@add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
|
||||
class BertForMaskedLM(BertPreTrainedModel):
|
||||
def __init__(self, config):
|
||||
@@ -868,13 +880,14 @@ class BertForMaskedLM(BertPreTrainedModel):
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
masked_lm_labels=None,
|
||||
labels=None,
|
||||
encoder_hidden_states=None,
|
||||
encoder_attention_mask=None,
|
||||
lm_labels=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
@@ -884,10 +897,12 @@ class BertForMaskedLM(BertPreTrainedModel):
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
|
||||
masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Masked language modeling loss.
|
||||
ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided):
|
||||
Next token prediction loss.
|
||||
@@ -914,11 +929,18 @@ class BertForMaskedLM(BertPreTrainedModel):
|
||||
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
|
||||
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, masked_lm_labels=input_ids)
|
||||
outputs = model(input_ids, labels=input_ids)
|
||||
|
||||
loss, prediction_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
if "masked_lm_labels" in kwargs:
|
||||
warnings.warn(
|
||||
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
|
||||
outputs = self.bert(
|
||||
input_ids,
|
||||
@@ -942,9 +964,9 @@ class BertForMaskedLM(BertPreTrainedModel):
|
||||
# of predictions for masked words.
|
||||
# 2. If `lm_labels` is provided we are in a causal scenario where we
|
||||
# try to predict the next token for each input in the decoder.
|
||||
if masked_lm_labels is not None:
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss() # -100 index = padding token
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
outputs = (masked_lm_loss,) + outputs
|
||||
|
||||
if lm_labels is not None:
|
||||
|
||||
@@ -489,7 +489,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
|
||||
r"""
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for language modeling.
|
||||
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
||||
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
|
||||
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
|
||||
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||
computed for labels in ``[0, ..., config.vocab_size]``
|
||||
|
||||
@@ -21,6 +21,7 @@
|
||||
import copy
|
||||
import logging
|
||||
import math
|
||||
import warnings
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -493,17 +494,19 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
||||
return self.vocab_projector
|
||||
|
||||
@add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING)
|
||||
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None):
|
||||
def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, **kwargs):
|
||||
r"""
|
||||
masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs:
|
||||
loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Masked language modeling loss.
|
||||
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
@@ -527,10 +530,18 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
||||
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
|
||||
model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, masked_lm_labels=input_ids)
|
||||
outputs = model(input_ids, labels=input_ids)
|
||||
loss, prediction_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
if "masked_lm_labels" in kwargs:
|
||||
warnings.warn(
|
||||
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
|
||||
dlbrt_output = self.distilbert(
|
||||
input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds
|
||||
)
|
||||
@@ -541,10 +552,8 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
||||
prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size)
|
||||
|
||||
outputs = (prediction_logits,) + dlbrt_output[1:]
|
||||
if masked_lm_labels is not None:
|
||||
mlm_loss = self.mlm_loss_fct(
|
||||
prediction_logits.view(-1, prediction_logits.size(-1)), masked_lm_labels.view(-1)
|
||||
)
|
||||
if labels is not None:
|
||||
mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1))
|
||||
outputs = (mlm_loss,) + outputs
|
||||
|
||||
return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions)
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
import logging
|
||||
import os
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@@ -561,18 +562,21 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
masked_lm_labels=None,
|
||||
labels=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs:
|
||||
masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Masked language modeling loss.
|
||||
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
@@ -597,11 +601,18 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
|
||||
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
|
||||
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, masked_lm_labels=input_ids)
|
||||
outputs = model(input_ids, labels=input_ids)
|
||||
|
||||
loss, prediction_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
if "masked_lm_labels" in kwargs:
|
||||
warnings.warn(
|
||||
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
|
||||
generator_hidden_states = self.electra(
|
||||
input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds
|
||||
@@ -614,9 +625,9 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
|
||||
output = (prediction_scores,)
|
||||
|
||||
# Masked language modeling softmax layer
|
||||
if masked_lm_labels is not None:
|
||||
if labels is not None:
|
||||
loss_fct = nn.CrossEntropyLoss() # -100 index = padding token
|
||||
loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||
loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
output = (loss,) + output
|
||||
|
||||
output += generator_hidden_states[1:]
|
||||
|
||||
@@ -191,7 +191,7 @@ class EncoderDecoderModel(PreTrainedModel):
|
||||
decoder_attention_mask=None,
|
||||
decoder_head_mask=None,
|
||||
decoder_inputs_embeds=None,
|
||||
masked_lm_labels=None,
|
||||
labels=None,
|
||||
lm_labels=None,
|
||||
**kwargs,
|
||||
):
|
||||
@@ -234,7 +234,7 @@ class EncoderDecoderModel(PreTrainedModel):
|
||||
Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded representation.
|
||||
This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors
|
||||
than the model's internal embedding lookup matrix.
|
||||
masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss for the decoder.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
@@ -294,7 +294,7 @@ class EncoderDecoderModel(PreTrainedModel):
|
||||
encoder_attention_mask=attention_mask,
|
||||
head_mask=decoder_head_mask,
|
||||
lm_labels=lm_labels,
|
||||
masked_lm_labels=masked_lm_labels,
|
||||
labels=labels,
|
||||
**kwargs_decoder,
|
||||
)
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
|
||||
import logging
|
||||
import os
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@@ -652,17 +653,18 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
mc_token_ids=None,
|
||||
lm_labels=None,
|
||||
labels=None,
|
||||
mc_labels=None,
|
||||
use_cache=True,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
|
||||
Index of the classification token in each input sequence.
|
||||
Selected in the range ``[0, input_ids.size(-1) - 1[``.
|
||||
lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
|
||||
Labels for language modeling.
|
||||
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
||||
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
|
||||
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
|
||||
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||
computed for labels in ``[0, ..., config.vocab_size]``
|
||||
@@ -670,12 +672,14 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
Labels for computing the multiple choice classification loss.
|
||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Return:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs:
|
||||
lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided):
|
||||
lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
|
||||
Language modeling loss.
|
||||
mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`multiple_choice_labels` is provided):
|
||||
mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided):
|
||||
Multiple choice classification loss.
|
||||
lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
@@ -720,6 +724,14 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
lm_prediction_scores, mc_prediction_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
if "lm_labels" in kwargs:
|
||||
warnings.warn(
|
||||
"The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
labels = kwargs.pop("lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
|
||||
transformer_outputs = self.transformer(
|
||||
input_ids,
|
||||
past=past,
|
||||
@@ -741,9 +753,9 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
|
||||
outputs = (loss,) + outputs
|
||||
if lm_labels is not None:
|
||||
if labels is not None:
|
||||
shift_logits = lm_logits[..., :-1, :].contiguous()
|
||||
shift_labels = lm_labels[..., 1:].contiguous()
|
||||
shift_labels = labels[..., 1:].contiguous()
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
||||
outputs = (loss,) + outputs
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
|
||||
import logging
|
||||
import math
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@@ -587,14 +588,11 @@ class LongformerModel(RobertaModel):
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
inputs_embeds=None,
|
||||
masked_lm_labels=None,
|
||||
):
|
||||
r"""
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
|
||||
masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Masked language modeling loss.
|
||||
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``):
|
||||
@@ -704,18 +702,21 @@ class LongformerForMaskedLM(BertPreTrainedModel):
|
||||
token_type_ids=None,
|
||||
position_ids=None,
|
||||
inputs_embeds=None,
|
||||
masked_lm_labels=None,
|
||||
labels=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
|
||||
masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Masked language modeling loss.
|
||||
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
@@ -744,9 +745,17 @@ class LongformerForMaskedLM(BertPreTrainedModel):
|
||||
|
||||
attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM
|
||||
# check ``LongformerModel.forward`` for more details how to set `attention_mask`
|
||||
loss, prediction_scores = model(input_ids, attention_mask=attention_mask, masked_lm_labels=input_ids)
|
||||
loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids)
|
||||
"""
|
||||
|
||||
if "masked_lm_labels" in kwargs:
|
||||
warnings.warn(
|
||||
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
|
||||
outputs = self.longformer(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
@@ -760,9 +769,9 @@ class LongformerForMaskedLM(BertPreTrainedModel):
|
||||
|
||||
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
|
||||
|
||||
if masked_lm_labels is not None:
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
outputs = (masked_lm_loss,) + outputs
|
||||
|
||||
return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
|
||||
|
||||
@@ -20,6 +20,7 @@ import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@@ -588,16 +589,17 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
mc_token_ids=None,
|
||||
lm_labels=None,
|
||||
labels=None,
|
||||
mc_labels=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input)
|
||||
Index of the classification token in each input sequence.
|
||||
Selected in the range ``[0, input_ids.size(-1) - 1]``.
|
||||
lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`)
|
||||
Labels for language modeling.
|
||||
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
||||
Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
|
||||
Indices are selected in ``[-100, 0, ..., config.vocab_size]``
|
||||
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||
computed for labels in ``[0, ..., config.vocab_size]``
|
||||
@@ -605,12 +607,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
Labels for computing the multiple choice classification loss.
|
||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||
of the input tensors. (see `input_ids` above)
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Return:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs:
|
||||
lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided):
|
||||
lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided):
|
||||
Language modeling loss.
|
||||
mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`multiple_choice_labels` is provided):
|
||||
mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided):
|
||||
Multiple choice classification loss.
|
||||
lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
@@ -650,6 +654,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
lm_prediction_scores, mc_prediction_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
if "lm_labels" in kwargs:
|
||||
warnings.warn(
|
||||
"The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
labels = kwargs.pop("lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
|
||||
transformer_outputs = self.transformer(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
@@ -668,9 +680,9 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))
|
||||
outputs = (loss,) + outputs
|
||||
if lm_labels is not None:
|
||||
if labels is not None:
|
||||
shift_logits = lm_logits[..., :-1, :].contiguous()
|
||||
shift_labels = lm_labels[..., 1:].contiguous()
|
||||
shift_labels = labels[..., 1:].contiguous()
|
||||
loss_fct = CrossEntropyLoss()
|
||||
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
|
||||
outputs = (loss,) + outputs
|
||||
|
||||
@@ -1755,7 +1755,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel):
|
||||
|
||||
Return:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ReformerConfig`) and inputs:
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided):
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
|
||||
Classification loss (cross entropy).
|
||||
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
|
||||
|
||||
import logging
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
@@ -183,18 +184,21 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
||||
position_ids=None,
|
||||
head_mask=None,
|
||||
inputs_embeds=None,
|
||||
masked_lm_labels=None,
|
||||
labels=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the masked language modeling loss.
|
||||
Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
|
||||
Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
|
||||
in ``[0, ..., config.vocab_size]``
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs:
|
||||
masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
|
||||
Masked language modeling loss.
|
||||
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
@@ -218,10 +222,18 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
||||
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
|
||||
model = RobertaForMaskedLM.from_pretrained('roberta-base')
|
||||
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
outputs = model(input_ids, masked_lm_labels=input_ids)
|
||||
outputs = model(input_ids, labels=input_ids)
|
||||
loss, prediction_scores = outputs[:2]
|
||||
|
||||
"""
|
||||
if "masked_lm_labels" in kwargs:
|
||||
warnings.warn(
|
||||
"The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
labels = kwargs.pop("masked_lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
|
||||
outputs = self.roberta(
|
||||
input_ids,
|
||||
attention_mask=attention_mask,
|
||||
@@ -235,9 +247,9 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
||||
|
||||
outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
|
||||
|
||||
if masked_lm_labels is not None:
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss()
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
|
||||
masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
|
||||
outputs = (masked_lm_loss,) + outputs
|
||||
|
||||
return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
|
||||
|
||||
@@ -19,6 +19,7 @@ import copy
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import warnings
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
@@ -616,10 +617,10 @@ class T5PreTrainedModel(PreTrainedModel):
|
||||
shifted_input_ids[..., 0] = decoder_start_token_id
|
||||
|
||||
assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
|
||||
# replace possible -100 values in lm_labels by `pad_token_id`
|
||||
# replace possible -100 values in labels by `pad_token_id`
|
||||
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
|
||||
|
||||
assert torch.all(shifted_input_ids >= 0).item(), "Verify that `lm_labels` has only positive values and -100"
|
||||
assert torch.all(shifted_input_ids >= 0).item(), "Verify that `labels` has only positive values and -100"
|
||||
|
||||
return shifted_input_ids
|
||||
|
||||
@@ -1008,21 +1009,24 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
|
||||
decoder_attention_mask=None,
|
||||
decoder_past_key_value_states=None,
|
||||
use_cache=True,
|
||||
lm_labels=None,
|
||||
labels=None,
|
||||
inputs_embeds=None,
|
||||
decoder_inputs_embeds=None,
|
||||
head_mask=None,
|
||||
**kwargs
|
||||
):
|
||||
r"""
|
||||
lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
|
||||
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
|
||||
Labels for computing the language modeling loss.
|
||||
Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`.
|
||||
All labels set to ``-100`` are ignored (masked), the loss is only
|
||||
computed for labels in ``[0, ..., config.vocab_size]``
|
||||
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
|
||||
Used to hide legacy arguments that have been deprecated.
|
||||
|
||||
Returns:
|
||||
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs.
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided):
|
||||
loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
|
||||
Classification loss (cross entropy).
|
||||
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
@@ -1047,7 +1051,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
|
||||
tokenizer = T5Tokenizer.from_pretrained('t5-small')
|
||||
model = T5ForConditionalGeneration.from_pretrained('t5-small')
|
||||
input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1
|
||||
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids)
|
||||
outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
|
||||
loss, prediction_scores = outputs[:2]
|
||||
|
||||
tokenizer = T5Tokenizer.from_pretrained('t5-small')
|
||||
@@ -1056,6 +1060,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
|
||||
outputs = model.generate(input_ids)
|
||||
"""
|
||||
|
||||
if "lm_labels" in kwargs:
|
||||
warnings.warn(
|
||||
"The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
labels = kwargs.pop("lm_labels")
|
||||
assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
|
||||
|
||||
# Encode if needed (training, first prediction pass)
|
||||
if encoder_outputs is None:
|
||||
# Convert encoder inputs in embeddings if needed
|
||||
@@ -1065,14 +1077,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
|
||||
|
||||
hidden_states = encoder_outputs[0]
|
||||
|
||||
if lm_labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
|
||||
if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
|
||||
# get decoder inputs from shifting lm labels to the right
|
||||
decoder_input_ids = self._shift_right(lm_labels)
|
||||
decoder_input_ids = self._shift_right(labels)
|
||||
|
||||
# If decoding with past key value states, only the last tokens
|
||||
# should be given as an input
|
||||
if decoder_past_key_value_states is not None:
|
||||
assert lm_labels is None, "Decoder should not use cached key value states when training."
|
||||
assert labels is None, "Decoder should not use cached key value states when training."
|
||||
if decoder_input_ids is not None:
|
||||
decoder_input_ids = decoder_input_ids[:, -1:]
|
||||
if decoder_inputs_embeds is not None:
|
||||
@@ -1103,9 +1115,9 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
|
||||
lm_logits = self.lm_head(sequence_output)
|
||||
|
||||
decoder_outputs = (lm_logits,) + decoder_outputs[1:] # Add hidden states and attention if they are here
|
||||
if lm_labels is not None:
|
||||
if labels is not None:
|
||||
loss_fct = CrossEntropyLoss(ignore_index=-100)
|
||||
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
|
||||
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
|
||||
# TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
|
||||
decoder_outputs = (loss,) + decoder_outputs
|
||||
|
||||
|
||||
@@ -71,7 +71,7 @@ class EncoderDecoderModelTest(unittest.TestCase):
|
||||
"decoder_choice_labels": decoder_choice_labels,
|
||||
"encoder_hidden_states": encoder_hidden_states,
|
||||
"lm_labels": decoder_token_labels,
|
||||
"masked_lm_labels": decoder_token_labels,
|
||||
"labels": decoder_token_labels,
|
||||
}
|
||||
|
||||
def create_and_check_bert_encoder_decoder_model(
|
||||
@@ -224,7 +224,7 @@ class EncoderDecoderModelTest(unittest.TestCase):
|
||||
def check_loss_output(self, loss):
|
||||
self.assertEqual(loss.size(), ())
|
||||
|
||||
def create_and_check_bert_encoder_decoder_model_mlm_labels(
|
||||
def create_and_check_bert_encoder_decoder_model_labels(
|
||||
self,
|
||||
config,
|
||||
input_ids,
|
||||
@@ -233,7 +233,7 @@ class EncoderDecoderModelTest(unittest.TestCase):
|
||||
decoder_config,
|
||||
decoder_input_ids,
|
||||
decoder_attention_mask,
|
||||
masked_lm_labels,
|
||||
labels,
|
||||
**kwargs
|
||||
):
|
||||
encoder_model = BertModel(config)
|
||||
@@ -245,7 +245,7 @@ class EncoderDecoderModelTest(unittest.TestCase):
|
||||
decoder_input_ids=decoder_input_ids,
|
||||
attention_mask=attention_mask,
|
||||
decoder_attention_mask=decoder_attention_mask,
|
||||
masked_lm_labels=masked_lm_labels,
|
||||
labels=labels,
|
||||
)
|
||||
|
||||
mlm_loss = outputs_encoder_decoder[0]
|
||||
@@ -316,9 +316,9 @@ class EncoderDecoderModelTest(unittest.TestCase):
|
||||
input_ids_dict = self.prepare_config_and_inputs_bert()
|
||||
self.create_and_check_save_and_load_encoder_decoder_model(**input_ids_dict)
|
||||
|
||||
def test_bert_encoder_decoder_model_mlm_labels(self):
|
||||
def test_bert_encoder_decoder_model_labels(self):
|
||||
input_ids_dict = self.prepare_config_and_inputs_bert()
|
||||
self.create_and_check_bert_encoder_decoder_model_mlm_labels(**input_ids_dict)
|
||||
self.create_and_check_bert_encoder_decoder_model_labels(**input_ids_dict)
|
||||
|
||||
def test_bert_encoder_decoder_model_lm_labels(self):
|
||||
input_ids_dict = self.prepare_config_and_inputs_bert()
|
||||
|
||||
Reference in New Issue
Block a user