From b5ec526f8576daedfd4bfc7614ef5a860b83fd36 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 24 Sep 2019 17:10:50 +0200
Subject: [PATCH] updated data processor and metrics

---
 .gitignore                                    |  2 +-
 examples/run_glue.py                          |  7 +-
 pytorch_transformers/__init__.py              |  7 ++
 pytorch_transformers/data/__init__.py         |  6 ++
 pytorch_transformers/data/metrics/__init__.py | 83 +++++++++++++++++++
 .../data/processors/__init__.py               |  2 +
 .../processors}/glue.py                       | 83 +++++++++----------
 .../processors}/utils.py                      | 80 +++++++-----------
 .../preprocessing/__init__.py                 | 56 -------------
 9 files changed, 171 insertions(+), 155 deletions(-)
 create mode 100644 pytorch_transformers/data/__init__.py
 create mode 100644 pytorch_transformers/data/metrics/__init__.py
 create mode 100644 pytorch_transformers/data/processors/__init__.py
 rename pytorch_transformers/{preprocessing => data/processors}/glue.py (92%)
 rename pytorch_transformers/{preprocessing => data/processors}/utils.py (52%)
 delete mode 100644 pytorch_transformers/preprocessing/__init__.py

diff --git a/.gitignore b/.gitignore
index d285d0ded9..e673ce5f47 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,5 +130,5 @@ runs
 examples/runs
 
 # data
-data
+/data
 serialization_dir
\ No newline at end of file
diff --git a/examples/run_glue.py b/examples/run_glue.py
index 3748c1a13a..b39c6bf054 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -46,7 +46,10 @@ from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
 
 from pytorch_transformers import AdamW, WarmupLinearSchedule
 
-from pytorch_transformers.preprocessing import (compute_metrics, output_modes, processors, convert_examples_to_glue_features)
+from pytorch_transformers import glue_compute_metrics as compute_metrics
+from pytorch_transformers import glue_output_modes as output_modes
+from pytorch_transformers import glue_processors as processors
+from pytorch_transformers import glue_convert_examples_to_features as convert_examples_to_features
 
 logger = logging.getLogger(__name__)
 
@@ -275,7 +278,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             # HACK(label indices are swapped in RoBERTa pretrained model)
             label_list[1], label_list[2] = label_list[2], label_list[1] 
         examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-        features = convert_examples_to_glue_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
+        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
             pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
             pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
             pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index f12a5ebea7..93de6a982b 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -73,3 +73,10 @@ from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, Wa
 from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
                          cached_path, add_start_docstrings, add_end_docstrings,
                          WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME)
+
+from .data import (is_sklearn_available,
+                   InputExample, InputFeatures, DataProcessor,
+                   glue_output_modes, glue_convert_examples_to_features, glue_processors)
+
+if is_sklearn_available():
+    from .data import glue_compute_metrics
diff --git a/pytorch_transformers/data/__init__.py b/pytorch_transformers/data/__init__.py
new file mode 100644
index 0000000000..4522b802ab
--- /dev/null
+++ b/pytorch_transformers/data/__init__.py
@@ -0,0 +1,6 @@
+from .processors import (InputExample, InputFeatures, DataProcessor,
+                         glue_output_modes, glue_convert_examples_to_features, glue_processors)
+from .metrics import is_sklearn_available
+
+if is_sklearn_available():
+    from .metrics import glue_compute_metrics
diff --git a/pytorch_transformers/data/metrics/__init__.py b/pytorch_transformers/data/metrics/__init__.py
new file mode 100644
index 0000000000..4e0c088b90
--- /dev/null
+++ b/pytorch_transformers/data/metrics/__init__.py
@@ -0,0 +1,83 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+import sys
+import logging
+
+logger = logging.getLogger(__name__)
+
+try:
+    from scipy.stats import pearsonr, spearmanr
+    from sklearn.metrics import matthews_corrcoef, f1_score
+    _has_sklearn = True
+except e:
+    logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html")
+    _has_sklearn = False
+
+def is_sklearn_available():
+    return _has_sklearn
+
+if _has_sklearn:
+
+    def simple_accuracy(preds, labels):
+        return (preds == labels).mean()
+
+
+    def acc_and_f1(preds, labels):
+        acc = simple_accuracy(preds, labels)
+        f1 = f1_score(y_true=labels, y_pred=preds)
+        return {
+            "acc": acc,
+            "f1": f1,
+            "acc_and_f1": (acc + f1) / 2,
+        }
+
+
+    def pearson_and_spearman(preds, labels):
+        pearson_corr = pearsonr(preds, labels)[0]
+        spearman_corr = spearmanr(preds, labels)[0]
+        return {
+            "pearson": pearson_corr,
+            "spearmanr": spearman_corr,
+            "corr": (pearson_corr + spearman_corr) / 2,
+        }
+
+
+    def glue_compute_metrics(task_name, preds, labels):
+        assert len(preds) == len(labels)
+        if task_name == "cola":
+            return {"mcc": matthews_corrcoef(labels, preds)}
+        elif task_name == "sst-2":
+            return {"acc": simple_accuracy(preds, labels)}
+        elif task_name == "mrpc":
+            return acc_and_f1(preds, labels)
+        elif task_name == "sts-b":
+            return pearson_and_spearman(preds, labels)
+        elif task_name == "qqp":
+            return acc_and_f1(preds, labels)
+        elif task_name == "mnli":
+            return {"acc": simple_accuracy(preds, labels)}
+        elif task_name == "mnli-mm":
+            return {"acc": simple_accuracy(preds, labels)}
+        elif task_name == "qnli":
+            return {"acc": simple_accuracy(preds, labels)}
+        elif task_name == "rte":
+            return {"acc": simple_accuracy(preds, labels)}
+        elif task_name == "wnli":
+            return {"acc": simple_accuracy(preds, labels)}
+        else:
+            raise KeyError(task_name)
diff --git a/pytorch_transformers/data/processors/__init__.py b/pytorch_transformers/data/processors/__init__.py
new file mode 100644
index 0000000000..1a442bf839
--- /dev/null
+++ b/pytorch_transformers/data/processors/__init__.py
@@ -0,0 +1,2 @@
+from .utils import InputExample, InputFeatures, DataProcessor
+from .glue import output_modes, processors, convert_examples_to_glue_features
diff --git a/pytorch_transformers/preprocessing/glue.py b/pytorch_transformers/data/processors/glue.py
similarity index 92%
rename from pytorch_transformers/preprocessing/glue.py
rename to pytorch_transformers/data/processors/glue.py
index ed03e7ef4a..e1376816de 100644
--- a/pytorch_transformers/preprocessing/glue.py
+++ b/pytorch_transformers/data/processors/glue.py
@@ -15,12 +15,50 @@
 # limitations under the License.
 """ GLUE processors and helpers """
 
-from .utils import DataProcessor
 import logging
 import os
 
+from .utils import DataProcessor, InputExample, InputFeatures
+
 logger = logging.getLogger(__name__)
 
+GLUE_TASKS_NUM_LABELS = {
+    "cola": 2,
+    "mnli": 3,
+    "mrpc": 2,
+    "sst-2": 2,
+    "sts-b": 1,
+    "qqp": 2,
+    "qnli": 2,
+    "rte": 2,
+    "wnli": 2,
+}
+
+processors = {
+    "cola": ColaProcessor,
+    "mnli": MnliProcessor,
+    "mnli-mm": MnliMismatchedProcessor,
+    "mrpc": MrpcProcessor,
+    "sst-2": Sst2Processor,
+    "sts-b": StsbProcessor,
+    "qqp": QqpProcessor,
+    "qnli": QnliProcessor,
+    "rte": RteProcessor,
+    "wnli": WnliProcessor,
+}
+
+output_modes = {
+    "cola": "classification",
+    "mnli": "classification",
+    "mnli-mm": "classification",
+    "mrpc": "classification",
+    "sst-2": "classification",
+    "sts-b": "regression",
+    "qqp": "classification",
+    "qnli": "classification",
+    "rte": "classification",
+    "wnli": "classification",
+}
 
 def convert_examples_to_glue_features(examples, label_list, max_seq_length,
                                       tokenizer, output_mode,
@@ -91,37 +129,6 @@ def convert_examples_to_glue_features(examples, label_list, max_seq_length,
     return features
 
 
-class InputExample(object):
-    """A single training/test example for simple sequence classification."""
-
-    def __init__(self, guid, text_a, text_b=None, label=None):
-        """Constructs a InputExample.
-
-        Args:
-            guid: Unique id for the example.
-            text_a: string. The untokenized text of the first sequence. For single
-            sequence tasks, only this sequence must be specified.
-            text_b: (Optional) string. The untokenized text of the second sequence.
-            Only must be specified for sequence pair tasks.
-            label: (Optional) string. The label of the example. This should be
-            specified for train and dev examples, but not for test examples.
-        """
-        self.guid = guid
-        self.text_a = text_a
-        self.text_b = text_b
-        self.label = label
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self, input_ids, input_mask, segment_ids, label_id):
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.segment_ids = segment_ids
-        self.label_id = label_id
-
-
 class MrpcProcessor(DataProcessor):
     """Processor for the MRPC data set (GLUE version)."""
 
@@ -420,15 +427,3 @@ class WnliProcessor(DataProcessor):
             examples.append(
                 InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
         return examples
-
-GLUE_TASKS_NUM_LABELS = {
-    "cola": 2,
-    "mnli": 3,
-    "mrpc": 2,
-    "sst-2": 2,
-    "sts-b": 1,
-    "qqp": 2,
-    "qnli": 2,
-    "rte": 2,
-    "wnli": 2,
-}
\ No newline at end of file
diff --git a/pytorch_transformers/preprocessing/utils.py b/pytorch_transformers/data/processors/utils.py
similarity index 52%
rename from pytorch_transformers/preprocessing/utils.py
rename to pytorch_transformers/data/processors/utils.py
index b4a3d0d968..af90e7a47c 100644
--- a/pytorch_transformers/preprocessing/utils.py
+++ b/pytorch_transformers/data/processors/utils.py
@@ -17,8 +17,34 @@
 import csv
 import sys
 
-from scipy.stats import pearsonr, spearmanr
-from sklearn.metrics import matthews_corrcoef, f1_score
+class InputExample(object):
+    """A single training/test example for simple sequence classification."""
+    def __init__(self, guid, text_a, text_b=None, label=None):
+        """Constructs a InputExample.
+
+        Args:
+            guid: Unique id for the example.
+            text_a: string. The untokenized text of the first sequence. For single
+            sequence tasks, only this sequence must be specified.
+            text_b: (Optional) string. The untokenized text of the second sequence.
+            Only must be specified for sequence pair tasks.
+            label: (Optional) string. The label of the example. This should be
+            specified for train and dev examples, but not for test examples.
+        """
+        self.guid = guid
+        self.text_a = text_a
+        self.text_b = text_b
+        self.label = label
+
+
+class InputFeatures(object):
+    """A single set of features of data."""
+
+    def __init__(self, input_ids, input_mask, segment_ids, label_id):
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+        self.label_id = label_id
 
 
 class DataProcessor(object):
@@ -47,53 +73,3 @@ class DataProcessor(object):
                     line = list(unicode(cell, 'utf-8') for cell in line)
                 lines.append(line)
             return lines
-
-
-def simple_accuracy(preds, labels):
-    return (preds == labels).mean()
-
-
-def acc_and_f1(preds, labels):
-    acc = simple_accuracy(preds, labels)
-    f1 = f1_score(y_true=labels, y_pred=preds)
-    return {
-        "acc": acc,
-        "f1": f1,
-        "acc_and_f1": (acc + f1) / 2,
-    }
-
-
-def pearson_and_spearman(preds, labels):
-    pearson_corr = pearsonr(preds, labels)[0]
-    spearman_corr = spearmanr(preds, labels)[0]
-    return {
-        "pearson": pearson_corr,
-        "spearmanr": spearman_corr,
-        "corr": (pearson_corr + spearman_corr) / 2,
-    }
-
-
-def compute_metrics(task_name, preds, labels):
-    assert len(preds) == len(labels)
-    if task_name == "cola":
-        return {"mcc": matthews_corrcoef(labels, preds)}
-    elif task_name == "sst-2":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "mrpc":
-        return acc_and_f1(preds, labels)
-    elif task_name == "sts-b":
-        return pearson_and_spearman(preds, labels)
-    elif task_name == "qqp":
-        return acc_and_f1(preds, labels)
-    elif task_name == "mnli":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "mnli-mm":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "qnli":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "rte":
-        return {"acc": simple_accuracy(preds, labels)}
-    elif task_name == "wnli":
-        return {"acc": simple_accuracy(preds, labels)}
-    else:
-        raise KeyError(task_name)
\ No newline at end of file
diff --git a/pytorch_transformers/preprocessing/__init__.py b/pytorch_transformers/preprocessing/__init__.py
deleted file mode 100644
index 8550b80488..0000000000
--- a/pytorch_transformers/preprocessing/__init__.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .glue import (ColaProcessor,
-                  MnliProcessor,
-                  MnliMismatchedProcessor,
-                  MrpcProcessor,
-                  Sst2Processor,
-                  StsbProcessor,
-                  QqpProcessor,
-                  QnliProcessor,
-                  RteProcessor,
-                  WnliProcessor,
-                  convert_examples_to_glue_features,
-                  )
-
-from .utils import DataProcessor, simple_accuracy, acc_and_f1, pearson_and_spearman, compute_metrics
-
-processors = {
-    "cola": ColaProcessor,
-    "mnli": MnliProcessor,
-    "mnli-mm": MnliMismatchedProcessor,
-    "mrpc": MrpcProcessor,
-    "sst-2": Sst2Processor,
-    "sts-b": StsbProcessor,
-    "qqp": QqpProcessor,
-    "qnli": QnliProcessor,
-    "rte": RteProcessor,
-    "wnli": WnliProcessor,
-}
-
-output_modes = {
-    "cola": "classification",
-    "mnli": "classification",
-    "mnli-mm": "classification",
-    "mrpc": "classification",
-    "sst-2": "classification",
-    "sts-b": "regression",
-    "qqp": "classification",
-    "qnli": "classification",
-    "rte": "classification",
-    "wnli": "classification",
-}