From 092dacfd623b75530c39773930783783f58fbdbe Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 26 Jun 2019 09:54:05 +0200
Subject: [PATCH] changing is_regression to unified API

---
 examples/utils_glue.py                        | 12 +++++++
 .../convert_xlnet_checkpoint_to_pytorch.py    | 24 +++++++-------
 pytorch_pretrained_bert/modeling.py           | 11 +++++--
 pytorch_pretrained_bert/modeling_xlnet.py     | 33 ++++++++++---------
 4 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index 5d3454f439..e3e4179fae 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -591,3 +591,15 @@ output_modes = {
     "rte": "classification",
     "wnli": "classification",
 }
+
+GLUE_TASKS_NUM_LABELS = {
+    "cola": 2,
+    "mnli": 3,
+    "mrpc": 2,
+    "sst-2": 2,
+    "sts-b": 1,
+    "qqp": 2,
+    "qnli": 2,
+    "rte": 2,
+    "wnli": 2,
+}
diff --git a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
index d46cc99e73..258b82e363 100755
--- a/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_xlnet_checkpoint_to_pytorch.py
@@ -28,16 +28,16 @@ from pytorch_pretrained_bert.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
                                                     XLNetForSequenceClassification,
                                                     load_tf_weights_in_xlnet)
 
-GLUE_TASKS = {
-    "cola": "classification",
-    "mnli": "classification",
-    "mrpc": "classification",
-    "sst-2": "classification",
-    "sts-b": "regression",
-    "qqp": "classification",
-    "qnli": "classification",
-    "rte": "classification",
-    "wnli": "classification",
+GLUE_TASKS_NUM_LABELS = {
+    "cola": 2,
+    "mnli": 3,
+    "mrpc": 2,
+    "sst-2": 2,
+    "sts-b": 1,
+    "qqp": 2,
+    "qnli": 2,
+    "rte": 2,
+    "wnli": 2,
 }
 
 
@@ -46,9 +46,9 @@ def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, py
     config = XLNetConfig.from_json_file(bert_config_file)
 
     finetuning_task = finetuning_task.lower() if finetuning_task is not None else ""
-    if finetuning_task in GLUE_TASKS:
+    if finetuning_task in GLUE_TASKS_NUM_LABELS:
         print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config)))
-        model = XLNetForSequenceClassification(config, is_regression=bool(GLUE_TASKS[finetuning_task] == "regression"))
+        model = XLNetForSequenceClassification(config, num_labels=GLUE_TASKS_NUM_LABELS[finetuning_task])
     elif 'squad' in finetuning_task:
         model = XLNetForQuestionAnswering(config)
     else:
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index bc2304bc06..ce55c50c68 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -27,7 +27,7 @@ from io import open
 
 import torch
 from torch import nn
-from torch.nn import CrossEntropyLoss
+from torch.nn import CrossEntropyLoss, MSELoss
 
 from .file_utils import cached_path, WEIGHTS_NAME, CONFIG_NAME
 
@@ -1196,8 +1196,13 @@ class BertForSequenceClassification(BertPreTrainedModel):
         logits = self.classifier(pooled_output)
 
         if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            if self.num_labels == 1:
+                #  We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
             return loss
         elif self.output_attentions:
             return all_attentions, logits
diff --git a/pytorch_pretrained_bert/modeling_xlnet.py b/pytorch_pretrained_bert/modeling_xlnet.py
index 7ee7be9025..8963f53615 100644
--- a/pytorch_pretrained_bert/modeling_xlnet.py
+++ b/pytorch_pretrained_bert/modeling_xlnet.py
@@ -1175,7 +1175,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                target=None, output_all_encoded_layers=True, head_mask=None):
+                labels=None, output_all_encoded_layers=True, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
@@ -1212,11 +1212,11 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
         logits = self.lm_loss(output)
 
-        if target is not None:
+        if labels is not None:
             # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(logits.view(-1, logits.size(-1)),
-                            target.view(-1))
+                            labels.view(-1))
             return loss, new_mems
 
         # if self.output_attentions:
@@ -1305,13 +1305,13 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
 
     Outputs: Tuple of (logits or loss, mems)
         `logits or loss`:
-            if target is None:
+            if labels is None:
                 Token logits with shape [batch_size, sequence_length] 
             else:
                 CrossEntropy loss with the targets
         `new_mems`: list (num layers) of updated mem states at the entry of each layer
             each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
-            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
+            Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
 
     Example usage:
     ```python
@@ -1328,13 +1328,13 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
     ```
     """
     def __init__(self, config, summary_type="last", use_proj=True, num_labels=2,
-                 is_regression=False, output_attentions=False, keep_multihead_output=False):
+                 output_attentions=False, keep_multihead_output=False):
         super(XLNetForSequenceClassification, self).__init__(config)
         self.output_attentions = output_attentions
         self.attn_type = config.attn_type
         self.same_length = config.same_length
         self.summary_type = summary_type
-        self.is_regression = is_regression
+        self.num_labels = num_labels
 
         self.transformer = XLNetModel(config, output_attentions=output_attentions,
                                               keep_multihead_output=keep_multihead_output)
@@ -1342,12 +1342,12 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
         self.sequence_summary = XLNetSequenceSummary(config, summary_type=summary_type,
                                                      use_proj=use_proj, output_attentions=output_attentions,
                                                      keep_multihead_output=keep_multihead_output)
-        self.logits_proj = nn.Linear(config.d_model, num_labels if not is_regression else 1)
+        self.logits_proj = nn.Linear(config.d_model, num_labels)
         self.apply(self.init_weights)
 
     def forward(self, inp_k, token_type_ids=None, input_mask=None, attention_mask=None,
                 mems=None, perm_mask=None, target_mapping=None, inp_q=None,
-                target=None, output_all_encoded_layers=True, head_mask=None):
+                labels=None, output_all_encoded_layers=True, head_mask=None):
         """
         Args:
             inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
@@ -1376,19 +1376,20 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                 Set to None during finetuning.
         """
         output, _, new_mems = self.transformer(inp_k, token_type_ids, input_mask, attention_mask,
-                                            mems, perm_mask, target_mapping, inp_q,
-                                            output_all_encoded_layers, head_mask)
+                                               mems, perm_mask, target_mapping, inp_q,
+                                               output_all_encoded_layers, head_mask)
 
         output = self.sequence_summary(output)
         logits = self.logits_proj(output)
 
-        if target is not None:
-            if self.is_regression:
+        if labels is not None:
+            if self.num_labels == 1:
+                #  We are doing regression
                 loss_fct = MSELoss()
-                loss = loss_fct(logits.view(-1), target.view(-1))
+                loss = loss_fct(logits.view(-1), labels.view(-1))
             else:
-                loss_fct = CrossEntropyLoss(ignore_index=-1)
-                loss = loss_fct(logits.view(-1, logits.size(-1)), target.view(-1))
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
             return loss, new_mems
 
         # if self.output_attentions: