From 4c721c6b6aba636372b0c9e0ba449e182c6488d5 Mon Sep 17 00:00:00 2001 From: Ananya Harsh Jha Date: Fri, 15 Mar 2019 23:21:24 -0400 Subject: [PATCH] added eval time metrics for GLUE tasks --- examples/run_classifier.py | 159 ++++++++++++++++++++++++++++--------- 1 file changed, 123 insertions(+), 36 deletions(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index 972c110396..51d57be4dc 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -31,6 +31,10 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, from torch.utils.data.distributed import DistributedSampler from tqdm import tqdm, trange +from torch.nn import CrossEntropyLoss, MSELoss +from scipy.stats import pearsonr, spearmanr +from sklearn.metrics import matthews_corrcoef, f1_score + from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME from pytorch_pretrained_bert.tokenization import BertTokenizer @@ -401,13 +405,17 @@ class WnliProcessor(DataProcessor): return examples -def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer): +def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer, output_mode): """Loads a data file into a list of `InputBatch`s.""" label_map = {label : i for i, label in enumerate(label_list)} features = [] for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d of %d" % (ex_index, len(examples))) + tokens_a = tokenizer.tokenize(example.text_a) tokens_b = None @@ -463,7 +471,13 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length - label_id = label_map[example.label] + if output_mode == "classification": + label_id = label_map[example.label] + elif output_mode == "regression": + label_id = float(example.label) + else: + raise KeyError(output_mode) + if ex_index < 5: logger.info("*** Example ***") logger.info("guid: %s" % (example.guid)) @@ -499,9 +513,56 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length): else: tokens_b.pop() -def accuracy(out, labels): - outputs = np.argmax(out, axis=1) - return np.sum(outputs == labels) + +def simple_accuracy(preds, labels): + return (preds == labels).mean() + + +def acc_and_f1(preds, labels): + acc = simple_accuracy(preds, labels) + f1 = f1_score(y_true=labels, y_pred=preds) + return { + "acc": acc, + "f1": f1, + "acc_and_f1": (acc + f1) / 2, + } + + +def pearson_and_spearman(preds, labels): + pearson_corr = pearsonr(preds, labels)[0] + spearman_corr = spearmanr(preds, labels)[0] + return { + "pearson": pearson_corr, + "spearmanr": spearman_corr, + "corr": (pearson_corr + spearman_corr) / 2, + } + + +def compute_metrics(task_name, preds, labels): + assert len(preds) == len(labels) + if task_name == "cola": + return {"mcc": matthews_corrcoef(labels, preds)} + elif task_name == "sst-2": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mrpc": + return acc_and_f1(preds, labels) + elif task_name == "sts-b": + return pearson_and_spearman(preds, labels) + elif task_name == "qqp": + return acc_and_f1(preds, labels) + elif task_name == "mnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "mnli-mm": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "qnli": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "rte": + return {"acc": simple_accuracy(preds, labels)} + elif task_name == "wnli": + return {"acc": simple_accuracy(preds, labels)} + else: + raise KeyError(task_name) + def main(): parser = argparse.ArgumentParser() @@ -605,6 +666,7 @@ def main(): processors = { "cola": ColaProcessor, "mnli": MnliProcessor, + "mnli-mm": MnliMismatchedProcessor, "mrpc": MrpcProcessor, "sst-2": Sst2Processor, "sts-b": StsbProcessor, @@ -617,6 +679,7 @@ def main(): output_modes = { "cola": "classification", "mnli": "classification", + "mnli-mm": "classification", "mrpc": "classification", "sst-2": "classification", "sts-b": "regression", @@ -626,13 +689,6 @@ def main(): "wnli": "classification", } - num_labels_task = { - "cola": 2, - "sst-2": 2, - "mnli": 3, - "mrpc": 2, - } - if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() @@ -671,8 +727,10 @@ def main(): raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() - num_labels = num_labels_task[task_name] + output_mode = output_modes[task_name] + label_list = processor.get_labels() + num_labels = len(label_list) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) @@ -689,7 +747,7 @@ def main(): cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) model = BertForSequenceClassification.from_pretrained(args.bert_model, cache_dir=cache_dir, - num_labels = num_labels) + num_labels=num_labels) if args.fp16: model.half() model.to(device) @@ -737,7 +795,7 @@ def main(): tr_loss = 0 if args.do_train: train_features = convert_examples_to_features( - train_examples, label_list, args.max_seq_length, tokenizer) + train_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) @@ -745,7 +803,12 @@ def main(): all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) - all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) + + if output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) + elif output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) + train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) @@ -760,7 +823,17 @@ def main(): for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch - loss = model(input_ids, segment_ids, input_mask, label_ids) + + # define a new function to compute loss values for both output_modes + logits = model(input_ids, segment_ids, input_mask, labels=None) + + if output_mode == "classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) + elif output_mode == "regression": + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1), label_ids.view(-1)) + if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: @@ -805,22 +878,28 @@ def main(): if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features( - eval_examples, label_list, args.max_seq_length, tokenizer) + eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) - all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) + + if output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) + elif output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) + eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() - eval_loss, eval_accuracy = 0, 0 - nb_eval_steps, nb_eval_examples = 0, 0 + eval_loss = 0 + nb_eval_steps = 0 + preds = [] for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) @@ -829,26 +908,34 @@ def main(): label_ids = label_ids.to(device) with torch.no_grad(): - tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) - logits = model(input_ids, segment_ids, input_mask) - - logits = logits.detach().cpu().numpy() - label_ids = label_ids.to('cpu').numpy() - tmp_eval_accuracy = accuracy(logits, label_ids) - + logits = model(input_ids, segment_ids, input_mask, labels=None) + + # create eval loss and other metric required by the task + if output_mode == "classification": + loss_fct = CrossEntropyLoss() + tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1)) + elif output_mode == "regression": + loss_fct = MSELoss() + tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1)) + eval_loss += tmp_eval_loss.mean().item() - eval_accuracy += tmp_eval_accuracy - - nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 + if len(preds) == 0: + preds.append(logits.detach().cpu().numpy()) + else: + preds[0] = np.append( + preds[0], logits.detach().cpu().numpy(), axis=0) eval_loss = eval_loss / nb_eval_steps - eval_accuracy = eval_accuracy / nb_eval_examples + preds = preds[0] + if output_mode == "classification": + preds = np.argmax(preds, axis=1) + result = compute_metrics(task_name, preds, all_label_ids.numpy()) loss = tr_loss/nb_tr_steps if args.do_train else None - result = {'eval_loss': eval_loss, - 'eval_accuracy': eval_accuracy, - 'global_step': global_step, - 'loss': loss} + + result['eval_loss'] = eval_loss + result['global_step'] = global_step + result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: