diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py index 2cf4b34f24..a6c54970f0 100644 --- a/src/transformers/modeling_albert.py +++ b/src/transformers/modeling_albert.py @@ -17,6 +17,7 @@ import logging import math import os +import warnings import torch import torch.nn as nn @@ -599,11 +600,12 @@ class AlbertForPreTraining(AlbertPreTrainedModel): position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None, + labels=None, sentence_order_label=None, + **kwargs ): r""" - masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): + labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels @@ -613,10 +615,12 @@ class AlbertForPreTraining(AlbertPreTrainedModel): Indices should be in ``[0, 1]``. ``0`` indicates original order (sequence A, then sequence B), ``1`` indicates switched order (sequence B, then sequence A). + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). @@ -651,6 +655,14 @@ class AlbertForPreTraining(AlbertPreTrainedModel): """ + if "masked_lm_labels" in kwargs: + warnings.warn( + "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + DeprecationWarning, + ) + labels = kwargs.pop("masked_lm_labels") + assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." + outputs = self.albert( input_ids, attention_mask=attention_mask, @@ -667,9 +679,9 @@ class AlbertForPreTraining(AlbertPreTrainedModel): outputs = (prediction_scores, sop_scores,) + outputs[2:] # add hidden states and attention if they are here - if masked_lm_labels is not None and sentence_order_label is not None: + if labels is not None and sentence_order_label is not None: loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1)) total_loss = masked_lm_loss + sentence_order_loss outputs = (total_loss,) + outputs @@ -742,18 +754,21 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None, + labels=None, + **kwargs ): r""" - masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs: - loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). @@ -777,10 +792,18 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2') model = AlbertForMaskedLM.from_pretrained('albert-base-v2') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, masked_lm_labels=input_ids) + outputs = model(input_ids, labels=input_ids) loss, prediction_scores = outputs[:2] """ + if "masked_lm_labels" in kwargs: + warnings.warn( + "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + DeprecationWarning, + ) + labels = kwargs.pop("masked_lm_labels") + assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." + outputs = self.albert( input_ids=input_ids, attention_mask=attention_mask, @@ -794,9 +817,9 @@ class AlbertForMaskedLM(AlbertPreTrainedModel): prediction_scores = self.predictions(sequence_outputs) outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here - if masked_lm_labels is not None: + if labels is not None: loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py index 929944b248..0c44de1b12 100644 --- a/src/transformers/modeling_bart.py +++ b/src/transformers/modeling_bart.py @@ -16,6 +16,7 @@ import logging import math import random +import warnings from typing import Dict, List, Optional, Tuple import numpy as np @@ -900,12 +901,12 @@ class BartForConditionalGeneration(PretrainedBartModel): decoder_input_ids=None, decoder_attention_mask=None, decoder_cached_states=None, - lm_labels=None, + labels=None, use_cache=False, **unused ): r""" - lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring). Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens @@ -914,7 +915,7 @@ class BartForConditionalGeneration(PretrainedBartModel): Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: - masked_lm_loss (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). @@ -945,6 +946,13 @@ class BartForConditionalGeneration(PretrainedBartModel): tokenizer.decode(predictions).split() # ['good', 'great', 'all', 'really', 'very'] """ + if "lm_labels" in unused: + warnings.warn( + "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + DeprecationWarning, + ) + labels = unused.pop("lm_labels") + outputs = self.model( input_ids, attention_mask=attention_mask, @@ -956,10 +964,10 @@ class BartForConditionalGeneration(PretrainedBartModel): ) lm_logits = F.linear(outputs[0], self.model.shared.weight, bias=self.final_logits_bias) outputs = (lm_logits,) + outputs[1:] # Add cache, hidden states and attention if they are here - if lm_labels is not None: + if labels is not None: loss_fct = nn.CrossEntropyLoss() - # TODO(SS): do we need to ignore pad tokens in lm_labels? - masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), lm_labels.view(-1)) + # TODO(SS): do we need to ignore pad tokens in labels? + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 877bb55fc2..82e8df0abb 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -19,6 +19,7 @@ import logging import math import os +import warnings import torch from torch import nn @@ -768,11 +769,12 @@ class BertForPreTraining(BertPreTrainedModel): position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None, + labels=None, next_sentence_label=None, + **kwargs ): r""" - masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): + labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels @@ -782,10 +784,12 @@ class BertForPreTraining(BertPreTrainedModel): Indices should be in ``[0, 1]``. ``0`` indicates sequence B is a continuation of sequence A, ``1`` indicates sequence B is a random sequence. + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). @@ -819,6 +823,13 @@ class BertForPreTraining(BertPreTrainedModel): prediction_scores, seq_relationship_scores = outputs[:2] """ + if "masked_lm_labels" in kwargs: + warnings.warn( + "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + DeprecationWarning, + ) + labels = kwargs.pop("masked_lm_labels") + assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." outputs = self.bert( input_ids, @@ -836,9 +847,9 @@ class BertForPreTraining(BertPreTrainedModel): 2: ] # add hidden states and attention if they are here - if masked_lm_labels is not None and next_sentence_label is not None: + if labels is not None and next_sentence_label is not None: loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) total_loss = masked_lm_loss + next_sentence_loss outputs = (total_loss,) + outputs @@ -846,6 +857,7 @@ class BertForPreTraining(BertPreTrainedModel): return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions) +# TODO: Split with a different BertWithLMHead to get rid of `lm_labels` here and in encoder_decoder. @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING) class BertForMaskedLM(BertPreTrainedModel): def __init__(self, config): @@ -868,13 +880,14 @@ class BertForMaskedLM(BertPreTrainedModel): position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None, + labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, + **kwargs ): r""" - masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels @@ -884,10 +897,12 @@ class BertForMaskedLM(BertPreTrainedModel): Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_labels` is provided): Next token prediction loss. @@ -914,11 +929,18 @@ class BertForMaskedLM(BertPreTrainedModel): model = BertForMaskedLM.from_pretrained('bert-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, masked_lm_labels=input_ids) + outputs = model(input_ids, labels=input_ids) loss, prediction_scores = outputs[:2] """ + if "masked_lm_labels" in kwargs: + warnings.warn( + "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + DeprecationWarning, + ) + labels = kwargs.pop("masked_lm_labels") + assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." outputs = self.bert( input_ids, @@ -942,9 +964,9 @@ class BertForMaskedLM(BertPreTrainedModel): # of predictions for masked words. # 2. If `lm_labels` is provided we are in a causal scenario where we # try to predict the next token for each input in the decoder. - if masked_lm_labels is not None: + if labels is not None: loss_fct = CrossEntropyLoss() # -100 index = padding token - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) outputs = (masked_lm_loss,) + outputs if lm_labels is not None: diff --git a/src/transformers/modeling_ctrl.py b/src/transformers/modeling_ctrl.py index 4f2d063f1b..c17672cb10 100644 --- a/src/transformers/modeling_ctrl.py +++ b/src/transformers/modeling_ctrl.py @@ -489,7 +489,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): r""" labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for language modeling. - Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py index ee74bd6744..281553616e 100644 --- a/src/transformers/modeling_distilbert.py +++ b/src/transformers/modeling_distilbert.py @@ -21,6 +21,7 @@ import copy import logging import math +import warnings import numpy as np import torch @@ -493,17 +494,19 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): return self.vocab_projector @add_start_docstrings_to_callable(DISTILBERT_INPUTS_DOCSTRING) - def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, masked_lm_labels=None): + def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_embeds=None, labels=None, **kwargs): r""" - masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.DistilBertConfig`) and inputs: - loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). @@ -527,10 +530,18 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased') model = DistilBertForMaskedLM.from_pretrained('distilbert-base-cased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, masked_lm_labels=input_ids) + outputs = model(input_ids, labels=input_ids) loss, prediction_scores = outputs[:2] """ + if "masked_lm_labels" in kwargs: + warnings.warn( + "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + DeprecationWarning, + ) + labels = kwargs.pop("masked_lm_labels") + assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." + dlbrt_output = self.distilbert( input_ids=input_ids, attention_mask=attention_mask, head_mask=head_mask, inputs_embeds=inputs_embeds ) @@ -541,10 +552,8 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): prediction_logits = self.vocab_projector(prediction_logits) # (bs, seq_length, vocab_size) outputs = (prediction_logits,) + dlbrt_output[1:] - if masked_lm_labels is not None: - mlm_loss = self.mlm_loss_fct( - prediction_logits.view(-1, prediction_logits.size(-1)), masked_lm_labels.view(-1) - ) + if labels is not None: + mlm_loss = self.mlm_loss_fct(prediction_logits.view(-1, prediction_logits.size(-1)), labels.view(-1)) outputs = (mlm_loss,) + outputs return outputs # (mlm_loss), prediction_logits, (all hidden_states), (all attentions) diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py index b1c346c6e4..fea83559f2 100644 --- a/src/transformers/modeling_electra.py +++ b/src/transformers/modeling_electra.py @@ -1,5 +1,6 @@ import logging import os +import warnings import torch import torch.nn as nn @@ -561,18 +562,21 @@ class ElectraForMaskedLM(ElectraPreTrainedModel): position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None, + labels=None, + **kwargs ): r""" - masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.ElectraConfig`) and inputs: - masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). @@ -597,11 +601,18 @@ class ElectraForMaskedLM(ElectraPreTrainedModel): model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, masked_lm_labels=input_ids) + outputs = model(input_ids, labels=input_ids) loss, prediction_scores = outputs[:2] """ + if "masked_lm_labels" in kwargs: + warnings.warn( + "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + DeprecationWarning, + ) + labels = kwargs.pop("masked_lm_labels") + assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." generator_hidden_states = self.electra( input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds @@ -614,9 +625,9 @@ class ElectraForMaskedLM(ElectraPreTrainedModel): output = (prediction_scores,) # Masked language modeling softmax layer - if masked_lm_labels is not None: + if labels is not None: loss_fct = nn.CrossEntropyLoss() # -100 index = padding token - loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) output = (loss,) + output output += generator_hidden_states[1:] diff --git a/src/transformers/modeling_encoder_decoder.py b/src/transformers/modeling_encoder_decoder.py index c30e8abe1f..e7a8e154ea 100644 --- a/src/transformers/modeling_encoder_decoder.py +++ b/src/transformers/modeling_encoder_decoder.py @@ -191,7 +191,7 @@ class EncoderDecoderModel(PreTrainedModel): decoder_attention_mask=None, decoder_head_mask=None, decoder_inputs_embeds=None, - masked_lm_labels=None, + labels=None, lm_labels=None, **kwargs, ): @@ -234,7 +234,7 @@ class EncoderDecoderModel(PreTrainedModel): Optionally, instead of passing :obj:`decoder_input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss for the decoder. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels @@ -294,7 +294,7 @@ class EncoderDecoderModel(PreTrainedModel): encoder_attention_mask=attention_mask, head_mask=decoder_head_mask, lm_labels=lm_labels, - masked_lm_labels=masked_lm_labels, + labels=labels, **kwargs_decoder, ) diff --git a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py index c59cc506a5..cc9c89cd39 100644 --- a/src/transformers/modeling_gpt2.py +++ b/src/transformers/modeling_gpt2.py @@ -18,6 +18,7 @@ import logging import os +import warnings import torch import torch.nn as nn @@ -652,17 +653,18 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): head_mask=None, inputs_embeds=None, mc_token_ids=None, - lm_labels=None, + labels=None, mc_labels=None, use_cache=True, + **kwargs ): r""" mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input) Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. - lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`) + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`) Labels for language modeling. - Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` @@ -670,12 +672,14 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.GPT2Config`) and inputs: - lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided): + lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided): Language modeling loss. - mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`multiple_choice_labels` is provided): + mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided): Multiple choice classification loss. lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). @@ -720,6 +724,14 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): lm_prediction_scores, mc_prediction_scores = outputs[:2] """ + if "lm_labels" in kwargs: + warnings.warn( + "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + DeprecationWarning, + ) + labels = kwargs.pop("lm_labels") + assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." + transformer_outputs = self.transformer( input_ids, past=past, @@ -741,9 +753,9 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): loss_fct = CrossEntropyLoss() loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) outputs = (loss,) + outputs - if lm_labels is not None: + if labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = lm_labels[..., 1:].contiguous() + shift_labels = labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py index 89a677aa9b..dda95bfee5 100644 --- a/src/transformers/modeling_longformer.py +++ b/src/transformers/modeling_longformer.py @@ -16,6 +16,7 @@ import logging import math +import warnings import torch import torch.nn as nn @@ -587,14 +588,11 @@ class LongformerModel(RobertaModel): token_type_ids=None, position_ids=None, inputs_embeds=None, - masked_lm_labels=None, ): r""" Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: - masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): @@ -704,18 +702,21 @@ class LongformerForMaskedLM(BertPreTrainedModel): token_type_ids=None, position_ids=None, inputs_embeds=None, - masked_lm_labels=None, + labels=None, + **kwargs ): r""" - masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: - masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). @@ -744,9 +745,17 @@ class LongformerForMaskedLM(BertPreTrainedModel): attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM # check ``LongformerModel.forward`` for more details how to set `attention_mask` - loss, prediction_scores = model(input_ids, attention_mask=attention_mask, masked_lm_labels=input_ids) + loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids) """ + if "masked_lm_labels" in kwargs: + warnings.warn( + "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + DeprecationWarning, + ) + labels = kwargs.pop("masked_lm_labels") + assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." + outputs = self.longformer( input_ids, attention_mask=attention_mask, @@ -760,9 +769,9 @@ class LongformerForMaskedLM(BertPreTrainedModel): outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here - if masked_lm_labels is not None: + if labels is not None: loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py index 077a1ca2a0..a1c729ac69 100644 --- a/src/transformers/modeling_openai.py +++ b/src/transformers/modeling_openai.py @@ -20,6 +20,7 @@ import json import logging import math import os +import warnings import torch import torch.nn as nn @@ -588,16 +589,17 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): head_mask=None, inputs_embeds=None, mc_token_ids=None, - lm_labels=None, + labels=None, mc_labels=None, + **kwargs ): r""" mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input) Index of the classification token in each input sequence. Selected in the range ``[0, input_ids.size(-1) - 1[``. - lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`) + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`) Labels for language modeling. - Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids`` Indices are selected in ``[-1, 0, ..., config.vocab_size]`` All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` @@ -605,12 +607,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): Labels for computing the multiple choice classification loss. Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see `input_ids` above) + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.OpenAIGPTConfig`) and inputs: - lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``lm_labels`` is provided): + lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided): Language modeling loss. - mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`multiple_choice_labels` is provided): + mc_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`mc_labels` is provided): Multiple choice classification loss. lm_prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices, sequence_length, config.vocab_size)`): Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). @@ -650,6 +654,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): lm_prediction_scores, mc_prediction_scores = outputs[:2] """ + if "lm_labels" in kwargs: + warnings.warn( + "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + DeprecationWarning, + ) + labels = kwargs.pop("lm_labels") + assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." + transformer_outputs = self.transformer( input_ids, attention_mask=attention_mask, @@ -668,9 +680,9 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): loss_fct = CrossEntropyLoss() loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)) outputs = (loss,) + outputs - if lm_labels is not None: + if labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() - shift_labels = lm_labels[..., 1:].contiguous() + shift_labels = labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss() loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) outputs = (loss,) + outputs diff --git a/src/transformers/modeling_reformer.py b/src/transformers/modeling_reformer.py index c24a4219ea..c287a030af 100644 --- a/src/transformers/modeling_reformer.py +++ b/src/transformers/modeling_reformer.py @@ -1755,7 +1755,7 @@ class ReformerModelWithLMHead(ReformerPreTrainedModel): Return: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided): + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss (cross entropy). prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py index 175075a2fb..7d3176311e 100644 --- a/src/transformers/modeling_roberta.py +++ b/src/transformers/modeling_roberta.py @@ -17,6 +17,7 @@ import logging +import warnings import torch import torch.nn as nn @@ -183,18 +184,21 @@ class RobertaForMaskedLM(BertPreTrainedModel): position_ids=None, head_mask=None, inputs_embeds=None, - masked_lm_labels=None, + labels=None, + **kwargs ): r""" - masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: - masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: Masked language modeling loss. prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). @@ -218,10 +222,18 @@ class RobertaForMaskedLM(BertPreTrainedModel): tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForMaskedLM.from_pretrained('roberta-base') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, masked_lm_labels=input_ids) + outputs = model(input_ids, labels=input_ids) loss, prediction_scores = outputs[:2] """ + if "masked_lm_labels" in kwargs: + warnings.warn( + "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + DeprecationWarning, + ) + labels = kwargs.pop("masked_lm_labels") + assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." + outputs = self.roberta( input_ids, attention_mask=attention_mask, @@ -235,9 +247,9 @@ class RobertaForMaskedLM(BertPreTrainedModel): outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here - if masked_lm_labels is not None: + if labels is not None: loss_fct = CrossEntropyLoss() - masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1)) + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) outputs = (masked_lm_loss,) + outputs return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions) diff --git a/src/transformers/modeling_t5.py b/src/transformers/modeling_t5.py index 90d094eb54..ccc005571b 100644 --- a/src/transformers/modeling_t5.py +++ b/src/transformers/modeling_t5.py @@ -19,6 +19,7 @@ import copy import logging import math import os +import warnings import torch import torch.nn.functional as F @@ -616,10 +617,10 @@ class T5PreTrainedModel(PreTrainedModel): shifted_input_ids[..., 0] = decoder_start_token_id assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined." - # replace possible -100 values in lm_labels by `pad_token_id` + # replace possible -100 values in labels by `pad_token_id` shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) - assert torch.all(shifted_input_ids >= 0).item(), "Verify that `lm_labels` has only positive values and -100" + assert torch.all(shifted_input_ids >= 0).item(), "Verify that `labels` has only positive values and -100" return shifted_input_ids @@ -1008,21 +1009,24 @@ class T5ForConditionalGeneration(T5PreTrainedModel): decoder_attention_mask=None, decoder_past_key_value_states=None, use_cache=True, - lm_labels=None, + labels=None, inputs_embeds=None, decoder_inputs_embeds=None, head_mask=None, + **kwargs ): r""" - lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[-100, 0, ..., config.vocab_size - 1]`. All labels set to ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]`` + kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. Returns: :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.T5Config`) and inputs. - loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`lm_label` is provided): + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): Classification loss (cross entropy). prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). @@ -1047,7 +1051,7 @@ class T5ForConditionalGeneration(T5PreTrainedModel): tokenizer = T5Tokenizer.from_pretrained('t5-small') model = T5ForConditionalGeneration.from_pretrained('t5-small') input_ids = tokenizer.encode("Hello, my dog is cute", return_tensors="pt") # Batch size 1 - outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, lm_labels=input_ids) + outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids) loss, prediction_scores = outputs[:2] tokenizer = T5Tokenizer.from_pretrained('t5-small') @@ -1056,6 +1060,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): outputs = model.generate(input_ids) """ + if "lm_labels" in kwargs: + warnings.warn( + "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.", + DeprecationWarning, + ) + labels = kwargs.pop("lm_labels") + assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}." + # Encode if needed (training, first prediction pass) if encoder_outputs is None: # Convert encoder inputs in embeddings if needed @@ -1065,14 +1077,14 @@ class T5ForConditionalGeneration(T5PreTrainedModel): hidden_states = encoder_outputs[0] - if lm_labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: + if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None: # get decoder inputs from shifting lm labels to the right - decoder_input_ids = self._shift_right(lm_labels) + decoder_input_ids = self._shift_right(labels) # If decoding with past key value states, only the last tokens # should be given as an input if decoder_past_key_value_states is not None: - assert lm_labels is None, "Decoder should not use cached key value states when training." + assert labels is None, "Decoder should not use cached key value states when training." if decoder_input_ids is not None: decoder_input_ids = decoder_input_ids[:, -1:] if decoder_inputs_embeds is not None: @@ -1103,9 +1115,9 @@ class T5ForConditionalGeneration(T5PreTrainedModel): lm_logits = self.lm_head(sequence_output) decoder_outputs = (lm_logits,) + decoder_outputs[1:] # Add hidden states and attention if they are here - if lm_labels is not None: + if labels is not None: loss_fct = CrossEntropyLoss(ignore_index=-100) - loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)) + loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1)) # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 decoder_outputs = (loss,) + decoder_outputs diff --git a/tests/test_modeling_encoder_decoder.py b/tests/test_modeling_encoder_decoder.py index 6130cb8804..ce3b69d91e 100644 --- a/tests/test_modeling_encoder_decoder.py +++ b/tests/test_modeling_encoder_decoder.py @@ -71,7 +71,7 @@ class EncoderDecoderModelTest(unittest.TestCase): "decoder_choice_labels": decoder_choice_labels, "encoder_hidden_states": encoder_hidden_states, "lm_labels": decoder_token_labels, - "masked_lm_labels": decoder_token_labels, + "labels": decoder_token_labels, } def create_and_check_bert_encoder_decoder_model( @@ -224,7 +224,7 @@ class EncoderDecoderModelTest(unittest.TestCase): def check_loss_output(self, loss): self.assertEqual(loss.size(), ()) - def create_and_check_bert_encoder_decoder_model_mlm_labels( + def create_and_check_bert_encoder_decoder_model_labels( self, config, input_ids, @@ -233,7 +233,7 @@ class EncoderDecoderModelTest(unittest.TestCase): decoder_config, decoder_input_ids, decoder_attention_mask, - masked_lm_labels, + labels, **kwargs ): encoder_model = BertModel(config) @@ -245,7 +245,7 @@ class EncoderDecoderModelTest(unittest.TestCase): decoder_input_ids=decoder_input_ids, attention_mask=attention_mask, decoder_attention_mask=decoder_attention_mask, - masked_lm_labels=masked_lm_labels, + labels=labels, ) mlm_loss = outputs_encoder_decoder[0] @@ -316,9 +316,9 @@ class EncoderDecoderModelTest(unittest.TestCase): input_ids_dict = self.prepare_config_and_inputs_bert() self.create_and_check_save_and_load_encoder_decoder_model(**input_ids_dict) - def test_bert_encoder_decoder_model_mlm_labels(self): + def test_bert_encoder_decoder_model_labels(self): input_ids_dict = self.prepare_config_and_inputs_bert() - self.create_and_check_bert_encoder_decoder_model_mlm_labels(**input_ids_dict) + self.create_and_check_bert_encoder_decoder_model_labels(**input_ids_dict) def test_bert_encoder_decoder_model_lm_labels(self): input_ids_dict = self.prepare_config_and_inputs_bert()