Add cache_dir to save features in GLUE + Differentiate match/mismatch for MNLI metrics (#4621)

* Glue task cleanup

* Enable writing the cache to cache_dir in case the dataset lives in a read-only
filesystem.
* Differentiate match vs mismatch for MNLI metrics.

* Style

* Fix pytype

* Fix type

* Use cache_dir in mnli mismatch eval dataset

* Small Tweaks

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
This commit is contained in:
Jin Young Sohn
2020-06-02 10:40:14 -07:00
committed by GitHub
parent 70f7423436
commit b231a413f5
4 changed files with 36 additions and 16 deletions

View File

@@ -70,6 +70,7 @@ class GlueDataset(Dataset):
tokenizer: PreTrainedTokenizer,
limit_length: Optional[int] = None,
mode: Union[str, Split] = Split.train,
cache_dir: Optional[str] = None,
):
self.args = args
self.processor = glue_processors[args.task_name]()
@@ -81,7 +82,7 @@ class GlueDataset(Dataset):
raise KeyError("mode is not a valid split name")
# Load data features from cache or dataset file
cached_features_file = os.path.join(
args.data_dir,
cache_dir if cache_dir is not None else args.data_dir,
"cached_{}_{}_{}_{}".format(
mode.value, tokenizer.__class__.__name__, str(args.max_seq_length), args.task_name,
),

View File

@@ -63,9 +63,9 @@ if _has_sklearn:
elif task_name == "qqp":
return acc_and_f1(preds, labels)
elif task_name == "mnli":
return {"acc": simple_accuracy(preds, labels)}
return {"mnli/acc": simple_accuracy(preds, labels)}
elif task_name == "mnli-mm":
return {"acc": simple_accuracy(preds, labels)}
return {"mnli-mm/acc": simple_accuracy(preds, labels)}
elif task_name == "qnli":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "rte":