From c28bc80bbbe0cf5cb289944ec2374d5397ff6e31 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Fri, 29 Oct 2021 10:32:56 -0400 Subject: [PATCH] Generalize problem_type to all sequence classification models (#14180) * Generalize problem_type to all classification models * Missing import * Deberta BC and fix tests * Fix template * Missing imports * Revert change to reformer test * Fix style --- src/transformers/models/bart/modeling_bart.py | 24 ++++++-- .../modeling_bigbird_pegasus.py | 24 ++++++-- src/transformers/models/ctrl/modeling_ctrl.py | 24 ++++++-- .../models/deberta/modeling_deberta.py | 59 ++++++++++++------- .../models/deberta_v2/modeling_deberta_v2.py | 59 ++++++++++++------- src/transformers/models/fnet/modeling_fnet.py | 24 ++++++-- src/transformers/models/gpt2/modeling_gpt2.py | 24 ++++++-- .../models/gpt_neo/modeling_gpt_neo.py | 24 ++++++-- src/transformers/models/gptj/modeling_gptj.py | 24 ++++++-- .../models/ibert/modeling_ibert.py | 24 ++++++-- .../models/layoutlm/modeling_layoutlm.py | 24 ++++++-- .../models/layoutlmv2/modeling_layoutlmv2.py | 24 ++++++-- src/transformers/models/led/modeling_led.py | 23 +++++++- .../models/mbart/modeling_mbart.py | 24 ++++++-- .../megatron_bert/modeling_megatron_bert.py | 24 ++++++-- .../models/mpnet/modeling_mpnet.py | 24 ++++++-- .../models/openai/modeling_openai.py | 24 ++++++-- .../models/rembert/modeling_rembert.py | 24 ++++++-- .../models/roformer/modeling_roformer.py | 24 ++++++-- .../models/tapas/modeling_tapas.py | 24 ++++++-- .../models/transfo_xl/modeling_transfo_xl.py | 23 ++++++-- ...ng_{{cookiecutter.lowercase_modelname}}.py | 47 ++++++++++++--- tests/test_modeling_albert.py | 2 - tests/test_modeling_bert.py | 1 - tests/test_modeling_big_bird.py | 1 - tests/test_modeling_common.py | 26 ++++---- tests/test_modeling_convbert.py | 1 - tests/test_modeling_distilbert.py | 1 - tests/test_modeling_electra.py | 1 - tests/test_modeling_funnel.py | 1 - tests/test_modeling_longformer.py | 1 - tests/test_modeling_mobilebert.py | 1 - tests/test_modeling_openai.py | 2 +- tests/test_modeling_reformer.py | 4 ++ tests/test_modeling_roberta.py | 1 - tests/test_modeling_squeezebert.py | 1 - tests/test_modeling_xlm.py | 1 - tests/test_modeling_xlnet.py | 1 - 38 files changed, 474 insertions(+), 191 deletions(-) diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 8dc635d70c..e86e67f67f 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -22,7 +22,7 @@ from typing import Optional, Tuple import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import ( @@ -1475,14 +1475,26 @@ class BartForSequenceClassification(BartPretrainedModel): loss = None if labels is not None: - if self.config.num_labels == 1: - # regression + if self.config.problem_type is None: + if self.config.num_labels == 1: + self.config.problem_type = "regression" + elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: + if self.config.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py index 3467076ade..68d9ae0c0e 100755 --- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -23,7 +23,7 @@ from typing import Optional, Tuple import numpy as np import torch from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import ( @@ -2680,14 +2680,26 @@ class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel): loss = None if labels is not None: - if self.config.num_labels == 1: - # regression + if self.config.problem_type is None: + if self.config.num_labels == 1: + self.config.problem_type = "regression" + elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: + if self.config.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py index ea98f1322e..9c06e20269 100644 --- a/src/transformers/models/ctrl/modeling_ctrl.py +++ b/src/transformers/models/ctrl/modeling_ctrl.py @@ -20,7 +20,7 @@ from typing import Tuple import numpy as np import torch from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutput @@ -690,14 +690,26 @@ class CTRLForSequenceClassification(CTRLPreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) if not return_dict: output = (pooled_logits,) + transformer_outputs[2:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 6179012f8d..4a54ab6340 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -19,7 +19,7 @@ from collections.abc import Sequence import torch from torch import _softmax_backward_data, nn -from torch.nn import CrossEntropyLoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward @@ -1194,31 +1194,46 @@ class DebertaForSequenceClassification(DebertaPreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # regression task - loss_fn = nn.MSELoss() - logits = logits.view(-1).to(labels.dtype) - loss = loss_fn(logits, labels.view(-1)) - elif labels.dim() == 1 or labels.size(-1) == 1: - label_index = (labels >= 0).nonzero() - labels = labels.long() - if label_index.size(0) > 0: - labeled_logits = torch.gather(logits, 0, label_index.expand(label_index.size(0), logits.size(1))) - labels = torch.gather(labels, 0, label_index.view(-1)) - loss_fct = CrossEntropyLoss() - loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1)) + if self.config.problem_type is None: + if self.num_labels == 1: + # regression task + loss_fn = nn.MSELoss() + logits = logits.view(-1).to(labels.dtype) + loss = loss_fn(logits, labels.view(-1)) + elif labels.dim() == 1 or labels.size(-1) == 1: + label_index = (labels >= 0).nonzero() + labels = labels.long() + if label_index.size(0) > 0: + labeled_logits = torch.gather( + logits, 0, label_index.expand(label_index.size(0), logits.size(1)) + ) + labels = torch.gather(labels, 0, label_index.view(-1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1)) + else: + loss = torch.tensor(0).to(logits) else: - loss = torch.tensor(0).to(logits) - else: - log_softmax = nn.LogSoftmax(-1) - loss = -((log_softmax(logits) * labels).sum(-1)).mean() + log_softmax = nn.LogSoftmax(-1) + loss = -((log_softmax(logits) * labels).sum(-1)).mean() + elif self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output - else: - return SequenceClassifierOutput( - loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions - ) + + return SequenceClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) @add_start_docstrings( diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index f59ce7306a..b1ec6bd011 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -20,7 +20,7 @@ from collections.abc import Sequence import numpy as np import torch from torch import _softmax_backward_data, nn -from torch.nn import CrossEntropyLoss, LayerNorm +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss from ...activations import ACT2FN from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward @@ -1304,31 +1304,46 @@ class DebertaV2ForSequenceClassification(DebertaV2PreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # regression task - loss_fn = nn.MSELoss() - logits = logits.view(-1).to(labels.dtype) - loss = loss_fn(logits, labels.view(-1)) - elif labels.dim() == 1 or labels.size(-1) == 1: - label_index = (labels >= 0).nonzero() - labels = labels.long() - if label_index.size(0) > 0: - labeled_logits = torch.gather(logits, 0, label_index.expand(label_index.size(0), logits.size(1))) - labels = torch.gather(labels, 0, label_index.view(-1)) - loss_fct = CrossEntropyLoss() - loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1)) + if self.config.problem_type is None: + if self.num_labels == 1: + # regression task + loss_fn = nn.MSELoss() + logits = logits.view(-1).to(labels.dtype) + loss = loss_fn(logits, labels.view(-1)) + elif labels.dim() == 1 or labels.size(-1) == 1: + label_index = (labels >= 0).nonzero() + labels = labels.long() + if label_index.size(0) > 0: + labeled_logits = torch.gather( + logits, 0, label_index.expand(label_index.size(0), logits.size(1)) + ) + labels = torch.gather(labels, 0, label_index.view(-1)) + loss_fct = CrossEntropyLoss() + loss = loss_fct(labeled_logits.view(-1, self.num_labels).float(), labels.view(-1)) + else: + loss = torch.tensor(0).to(logits) else: - loss = torch.tensor(0).to(logits) - else: - log_softmax = nn.LogSoftmax(-1) - loss = -((log_softmax(logits) * labels).sum(-1)).mean() + log_softmax = nn.LogSoftmax(-1) + loss = -((log_softmax(logits) * labels).sum(-1)).mean() + elif self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output - else: - return SequenceClassifierOutput( - loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions - ) + + return SequenceClassifierOutput( + loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions + ) @add_start_docstrings( diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py index afcfd273f3..16ae695b34 100755 --- a/src/transformers/models/fnet/modeling_fnet.py +++ b/src/transformers/models/fnet/modeling_fnet.py @@ -23,7 +23,7 @@ import torch import torch.utils.checkpoint from packaging import version from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...file_utils import is_scipy_available @@ -927,14 +927,26 @@ class FNetForSequenceClassification(FNetPreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py index c0611158ed..d8f09d3e72 100644 --- a/src/transformers/models/gpt2/modeling_gpt2.py +++ b/src/transformers/models/gpt2/modeling_gpt2.py @@ -24,7 +24,7 @@ import torch import torch.utils.checkpoint from packaging import version from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss if version.parse(torch.__version__) >= version.parse("1.6"): @@ -1406,14 +1406,26 @@ class GPT2ForSequenceClassification(GPT2PreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py index a845ad5987..1be7de2f2c 100755 --- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py +++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py @@ -21,7 +21,7 @@ from typing import Tuple import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward @@ -895,14 +895,26 @@ class GPTNeoForSequenceClassification(GPTNeoPreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py index 15d16627ad..0ea10b1bb0 100755 --- a/src/transformers/models/gptj/modeling_gptj.py +++ b/src/transformers/models/gptj/modeling_gptj.py @@ -19,7 +19,7 @@ from typing import Tuple import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward @@ -931,14 +931,26 @@ class GPTJForSequenceClassification(GPTJPreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/ibert/modeling_ibert.py b/src/transformers/models/ibert/modeling_ibert.py index 3aa936ba6a..8173ce1be8 100644 --- a/src/transformers/models/ibert/modeling_ibert.py +++ b/src/transformers/models/ibert/modeling_ibert.py @@ -22,7 +22,7 @@ import math import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import gelu from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward @@ -1025,14 +1025,26 @@ class IBertForSequenceClassification(IBertPreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py index a8509561fe..ff93fb948b 100644 --- a/src/transformers/models/layoutlm/modeling_layoutlm.py +++ b/src/transformers/models/layoutlm/modeling_layoutlm.py @@ -20,7 +20,7 @@ import math import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings @@ -1059,14 +1059,26 @@ class LayoutLMForSequenceClassification(LayoutLMPreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py index 6c42ce1ccc..653d6ea627 100755 --- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py +++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py @@ -20,7 +20,7 @@ import math import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import ( @@ -1076,14 +1076,26 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py index c078a4e54c..5433d0a7c7 100755 --- a/src/transformers/models/led/modeling_led.py +++ b/src/transformers/models/led/modeling_led.py @@ -23,7 +23,7 @@ from typing import List, Optional, Tuple import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import ( @@ -2536,9 +2536,26 @@ class LEDForSequenceClassification(LEDPreTrainedModel): loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + if self.config.problem_type is None: + if self.config.num_labels == 1: + self.config.problem_type = "regression" + elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.config.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py index 093a2e0993..17a8dde45d 100755 --- a/src/transformers/models/mbart/modeling_mbart.py +++ b/src/transformers/models/mbart/modeling_mbart.py @@ -21,7 +21,7 @@ from typing import Optional, Tuple import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import ( @@ -1475,14 +1475,26 @@ class MBartForSequenceClassification(MBartPreTrainedModel): loss = None if labels is not None: - if self.config.num_labels == 1: - # regression + if self.config.problem_type is None: + if self.config.num_labels == 1: + self.config.problem_type = "regression" + elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: + if self.config.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py index 042720667c..47218d058f 100755 --- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py +++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py @@ -25,7 +25,7 @@ from typing import Optional, Tuple import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import ( @@ -1525,14 +1525,26 @@ class MegatronBertForSequenceClassification(MegatronBertPreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/mpnet/modeling_mpnet.py b/src/transformers/models/mpnet/modeling_mpnet.py index 9f3e7bd493..52cf537ed2 100644 --- a/src/transformers/models/mpnet/modeling_mpnet.py +++ b/src/transformers/models/mpnet/modeling_mpnet.py @@ -20,7 +20,7 @@ import math import torch from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN, gelu from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward @@ -736,14 +736,26 @@ class MPNetForSequenceClassification(MPNetPreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py index c3f36a8889..4a0e568d1d 100644 --- a/src/transformers/models/openai/modeling_openai.py +++ b/src/transformers/models/openai/modeling_openai.py @@ -24,7 +24,7 @@ from typing import Optional, Tuple import torch from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import gelu_new, silu from ...file_utils import ( @@ -823,14 +823,26 @@ class OpenAIGPTForSequenceClassification(OpenAIGPTPreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py index 599875347f..ec9babb5e1 100755 --- a/src/transformers/models/rembert/modeling_rembert.py +++ b/src/transformers/models/rembert/modeling_rembert.py @@ -21,7 +21,7 @@ import os import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import ( @@ -1220,14 +1220,26 @@ class RemBertForSequenceClassification(RemBertPreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py index 0543fa8f24..48ae11ed33 100644 --- a/src/transformers/models/roformer/modeling_roformer.py +++ b/src/transformers/models/roformer/modeling_roformer.py @@ -23,7 +23,7 @@ import numpy as np import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import ( @@ -1287,14 +1287,26 @@ class RoFormerForSequenceClassification(RoFormerPreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py index a486b509fc..4189ac23ed 100644 --- a/src/transformers/models/tapas/modeling_tapas.py +++ b/src/transformers/models/tapas/modeling_tapas.py @@ -24,7 +24,7 @@ from typing import Optional, Tuple import torch import torch.utils.checkpoint from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import ( @@ -1532,14 +1532,26 @@ class TapasForSequenceClassification(TapasPreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[2:] return ((loss,) + output) if loss is not None else output diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/transfo_xl/modeling_transfo_xl.py index b2703bee4f..e18a5b50f6 100644 --- a/src/transformers/models/transfo_xl/modeling_transfo_xl.py +++ b/src/transformers/models/transfo_xl/modeling_transfo_xl.py @@ -22,7 +22,7 @@ from typing import List, Optional, Tuple import torch from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...file_utils import ( ModelOutput, @@ -1234,13 +1234,26 @@ class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel): loss = None if labels is not None: - if self.num_labels == 1: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(pooled_logits.view(-1), labels.to(self.dtype).view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py index 5d894709a0..bd889fea2b 100755 --- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py +++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py @@ -24,7 +24,7 @@ import torch import torch.utils.checkpoint from packaging import version from torch import nn -from torch.nn import CrossEntropyLoss, MSELoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import ( @@ -1265,14 +1265,26 @@ class {{cookiecutter.camelcase_modelname}}ForSequenceClassification({{cookiecutt loss = None if labels is not None: - if self.num_labels == 1: - # We are doing regression + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": loss_fct = MSELoss() - loss = loss_fct(logits.view(-1), labels.view(-1)) - else: + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": loss_fct = CrossEntropyLoss() loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) - + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1564,7 +1576,7 @@ from typing import Optional, Tuple import torch from torch import nn -from torch.nn import CrossEntropyLoss +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...file_utils import ( @@ -2981,9 +2993,26 @@ class {{cookiecutter.camelcase_modelname}}ForSequenceClassification({{cookiecutt loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + if self.config.problem_type is None: + if self.config.num_labels == 1: + self.config.problem_type = "regression" + elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.config.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) if not return_dict: output = (logits,) + outputs[1:] return ((loss,) + output) if loss is not None else output diff --git a/tests/test_modeling_albert.py b/tests/test_modeling_albert.py index a8228d3e13..fa841fe299 100644 --- a/tests/test_modeling_albert.py +++ b/tests/test_modeling_albert.py @@ -234,8 +234,6 @@ class AlbertModelTest(ModelTesterMixin, unittest.TestCase): fx_ready_model_classes = all_model_classes fx_dynamic_ready_model_classes = all_model_classes - test_sequence_classification_problem_types = True - # special case for ForPreTraining model def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) diff --git a/tests/test_modeling_bert.py b/tests/test_modeling_bert.py index 7b7f02a553..6c4613e6a3 100755 --- a/tests/test_modeling_bert.py +++ b/tests/test_modeling_bert.py @@ -446,7 +446,6 @@ class BertModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_generative_model_classes = (BertLMHeadModel,) if is_torch_available() else () fx_ready_model_classes = all_model_classes fx_dynamic_ready_model_classes = all_model_classes - test_sequence_classification_problem_types = True # special case for ForPreTraining model def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/test_modeling_big_bird.py b/tests/test_modeling_big_bird.py index d0859fc08f..401fade117 100644 --- a/tests/test_modeling_big_bird.py +++ b/tests/test_modeling_big_bird.py @@ -435,7 +435,6 @@ class BigBirdModelTest(ModelTesterMixin, unittest.TestCase): # head masking & pruning is currently not supported for big bird test_head_masking = False test_pruning = False - test_sequence_classification_problem_types = True # torchscript should be possible, but takes prohibitively long to test. # Also torchscript is not an important feature to have in the beginning. diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 3890198edb..20ee1c49e9 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -113,7 +113,6 @@ class ModelTesterMixin: test_missing_keys = True test_model_parallel = False is_encoder_decoder = False - test_sequence_classification_problem_types = False def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = copy.deepcopy(inputs_dict) @@ -387,12 +386,13 @@ class ModelTesterMixin: if not self.model_tester.is_training: return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - config.return_dict = True - for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + if model_class in get_values(MODEL_MAPPING): continue + model = model_class(config) model.to(torch_device) model.train() @@ -401,14 +401,14 @@ class ModelTesterMixin: loss.backward() def test_training_gradient_checkpointing(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.model_tester.is_training: return - config.use_cache = False - config.return_dict = True - for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.use_cache = False + config.return_dict = True + if model_class in get_values(MODEL_MAPPING) or not model_class.supports_gradient_checkpointing: continue model = model_class(config) @@ -1842,9 +1842,6 @@ class ModelTesterMixin: model.generate(**cast_to_device(inputs_dict, "cuda:0"), num_beams=2) def test_problem_types(self): - if not self.test_sequence_classification_problem_types: - return - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() problem_types = [ @@ -1880,7 +1877,11 @@ class ModelTesterMixin: # See https://github.com/huggingface/transformers/issues/11780 with warnings.catch_warnings(record=True) as warning_list: loss = model(**inputs).loss - self.assertListEqual(warning_list, []) + for w in warning_list: + if "Using a target size that is different to the input size" in str(w.message): + raise ValueError( + f"Something is going wrong in the regression problem: intercepted {w.message}" + ) loss.backward() @@ -2184,7 +2185,6 @@ class ModelPushToHubTester(unittest.TestCase): f.write(FAKE_MODEL_CODE) repo.push_to_hub() - print(os.listdir(tmp_dir)) new_model = AutoModel.from_pretrained(f"{USER}/test-dynamic-model", trust_remote_code=True) for p1, p2 in zip(model.parameters(), new_model.parameters()): diff --git a/tests/test_modeling_convbert.py b/tests/test_modeling_convbert.py index d4cde34be8..dccf7662e7 100644 --- a/tests/test_modeling_convbert.py +++ b/tests/test_modeling_convbert.py @@ -262,7 +262,6 @@ class ConvBertModelTest(ModelTesterMixin, unittest.TestCase): ) test_pruning = False test_head_masking = False - test_sequence_classification_problem_types = True def setUp(self): self.model_tester = ConvBertModelTester(self) diff --git a/tests/test_modeling_distilbert.py b/tests/test_modeling_distilbert.py index ed7fba94bb..8026f92db6 100644 --- a/tests/test_modeling_distilbert.py +++ b/tests/test_modeling_distilbert.py @@ -214,7 +214,6 @@ class DistilBertModelTest(ModelTesterMixin, unittest.TestCase): test_pruning = True test_torchscript = True test_resize_embeddings = True - test_sequence_classification_problem_types = True test_resize_position_embeddings = True def setUp(self): diff --git a/tests/test_modeling_electra.py b/tests/test_modeling_electra.py index 2b19bb4a5d..933e542025 100644 --- a/tests/test_modeling_electra.py +++ b/tests/test_modeling_electra.py @@ -291,7 +291,6 @@ class ElectraModelTest(ModelTesterMixin, unittest.TestCase): ) fx_ready_model_classes = all_model_classes fx_dynamic_ready_model_classes = all_model_classes - test_sequence_classification_problem_types = True # special case for ForPreTraining model def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/test_modeling_funnel.py b/tests/test_modeling_funnel.py index 0f720d3b42..9ed3b0339b 100644 --- a/tests/test_modeling_funnel.py +++ b/tests/test_modeling_funnel.py @@ -362,7 +362,6 @@ class FunnelModelTest(ModelTesterMixin, unittest.TestCase): if is_torch_available() else () ) - test_sequence_classification_problem_types = True # special case for ForPreTraining model def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/test_modeling_longformer.py b/tests/test_modeling_longformer.py index 2e506348e4..2d30bd3ba4 100644 --- a/tests/test_modeling_longformer.py +++ b/tests/test_modeling_longformer.py @@ -278,7 +278,6 @@ class LongformerModelTester: class LongformerModelTest(ModelTesterMixin, unittest.TestCase): test_pruning = False # pruning is not supported test_torchscript = False - test_sequence_classification_problem_types = True all_model_classes = ( ( diff --git a/tests/test_modeling_mobilebert.py b/tests/test_modeling_mobilebert.py index 23eb5a9c5e..716714157a 100644 --- a/tests/test_modeling_mobilebert.py +++ b/tests/test_modeling_mobilebert.py @@ -271,7 +271,6 @@ class MobileBertModelTest(ModelTesterMixin, unittest.TestCase): ) fx_ready_model_classes = all_model_classes fx_dynamic_ready_model_classes = all_model_classes - test_sequence_classification_problem_types = True # special case for ForPreTraining model def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/test_modeling_openai.py b/tests/test_modeling_openai.py index 584fac66b0..5ad53ac52c 100644 --- a/tests/test_modeling_openai.py +++ b/tests/test_modeling_openai.py @@ -143,7 +143,7 @@ class OpenAIGPTModelTester: model = OpenAIGPTForSequenceClassification(config) model.to(torch_device) model.eval() - # print(config.num_labels, sequence_labels.size()) + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) result = model(input_ids, token_type_ids=token_type_ids, labels=sequence_labels) self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) diff --git a/tests/test_modeling_reformer.py b/tests/test_modeling_reformer.py index de62e3ed5b..dff424bb99 100644 --- a/tests/test_modeling_reformer.py +++ b/tests/test_modeling_reformer.py @@ -795,6 +795,10 @@ class ReformerLSHAttnModelTest(ReformerTesterMixin, ModelTesterMixin, Generation [expected_shape] * len(iter_hidden_states), ) + def test_problem_types(self): + # Fails because the sequence length is not a multiple of 4 + pass + @require_torch @require_sentencepiece diff --git a/tests/test_modeling_roberta.py b/tests/test_modeling_roberta.py index 0f700009e0..031263cf6d 100644 --- a/tests/test_modeling_roberta.py +++ b/tests/test_modeling_roberta.py @@ -356,7 +356,6 @@ class RobertaModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas else () ) all_generative_model_classes = (RobertaForCausalLM,) if is_torch_available() else () - test_sequence_classification_problem_types = True def setUp(self): self.model_tester = RobertaModelTester(self) diff --git a/tests/test_modeling_squeezebert.py b/tests/test_modeling_squeezebert.py index 546b6f9e83..a81f1a3fed 100644 --- a/tests/test_modeling_squeezebert.py +++ b/tests/test_modeling_squeezebert.py @@ -232,7 +232,6 @@ class SqueezeBertModelTest(ModelTesterMixin, unittest.TestCase): test_torchscript = True test_resize_embeddings = True test_head_masking = False - test_sequence_classification_problem_types = True def setUp(self): self.model_tester = SqueezeBertModelTester(self) diff --git a/tests/test_modeling_xlm.py b/tests/test_modeling_xlm.py index 996bace511..82adbdc949 100644 --- a/tests/test_modeling_xlm.py +++ b/tests/test_modeling_xlm.py @@ -350,7 +350,6 @@ class XLMModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_generative_model_classes = ( (XLMWithLMHeadModel,) if is_torch_available() else () ) # TODO (PVP): Check other models whether language generation is also applicable - test_sequence_classification_problem_types = True # XLM has 2 QA models -> need to manually set the correct labels for one of them here def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): diff --git a/tests/test_modeling_xlnet.py b/tests/test_modeling_xlnet.py index c26e24df50..116cc5df3a 100644 --- a/tests/test_modeling_xlnet.py +++ b/tests/test_modeling_xlnet.py @@ -527,7 +527,6 @@ class XLNetModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase) (XLNetLMHeadModel,) if is_torch_available() else () ) # TODO (PVP): Check other models whether language generation is also applicable test_pruning = False - test_sequence_classification_problem_types = True # XLNet has 2 QA models -> need to manually set the correct labels for one of them here def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):