diff --git a/examples/run_glue.py b/examples/run_glue.py index da1e8d8123..1c2e921ef7 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -20,7 +20,6 @@ from __future__ import absolute_import, division, print_function import argparse import logging import os -import sys import random from tqdm import tqdm, trange @@ -30,7 +29,6 @@ import torch from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) from torch.utils.data.distributed import DistributedSampler -from torch.nn import CrossEntropyLoss, MSELoss from tensorboardX import SummaryWriter @@ -45,6 +43,186 @@ from utils_glue import processors, output_modes, convert_examples_to_features, c logger = logging.getLogger(__name__) +def train(args, train_features, model): + """ Train the model """ + if args.local_rank in [-1, 0]: + tb_writer = SummaryWriter() + + # Convert in tensors and build dataloader + all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) + if args.output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) + elif args.output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) + + args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps + + train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) + train_sampler = RandomSampler(train_data) if args.local_rank == -1 else DistributedSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) + + num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs + + # Prepare optimizer + param_optimizer = list(model.named_parameters()) + no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} + ] + if args.fp16: + try: + from apex.optimizers import FP16_Optimizer, FusedAdam + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) + if args.loss_scale == 0: + optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) + else: + optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) + warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, t_total=num_train_optimization_steps) + + else: + optimizer = BertAdam(optimizer_grouped_parameters, + lr=args.learning_rate, + warmup=args.warmup_proportion, + t_total=num_train_optimization_steps) + + # Train! + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_features)) + logger.info(" Batch size = %d", args.train_batch_size) + logger.info(" Num steps = %d", num_train_optimization_steps) + + global_step = 0 + tr_loss = 0 + model.train() + for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]): + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): + batch = tuple(t.to(args.device) for t in batch) + input_ids, input_mask, segment_ids, label_ids = batch + + ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) + loss = ouputs[0] + + if args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + + loss.backward() if not args.fp16 else optimizer.backward(loss) + + tr_loss += loss.item() + if (step + 1) % args.gradient_accumulation_steps == 0: + if args.fp16: + # modify learning rate with special warm up BERT uses + # if args.fp16 is False, BertAdam is used that handles this automatically + lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) + for param_group in optimizer.param_groups: + param_group['lr'] = lr_this_step + optimizer.step() + optimizer.zero_grad() + global_step += 1 + if args.local_rank in [-1, 0]: + if not args.fp16: + tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) + tb_writer.add_scalar('loss', loss.item(), global_step) + + return global_step, tr_loss / global_step + + +def evalutate(args, eval_task, eval_output_dir, eval_features, model): + """ Evaluate the model """ + if os.path.exists(eval_output_dir) and os.listdir(eval_output_dir) and args.do_train and not args.overwrite_output_dir: + raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(eval_output_dir)) + if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]: + os.makedirs(eval_output_dir) + + # Convert in tensors and build dataloader + all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) + all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) + all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) + if args.output_mode == "classification": + all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) + elif args.output_mode == "regression": + all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) + + eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) + # Note that DistributedSampler samples randomly + eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data) + eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) + + # Eval! + logger.info("***** Running evaluation *****") + logger.info(" Num examples = %d", len(eval_examples)) + logger.info(" Batch size = %d", args.eval_batch_size) + model.eval() + eval_loss = 0 + nb_eval_steps = 0 + preds = None + out_label_ids = None + for batch in tqdm(eval_dataloader, desc="Evaluating"): + batch = tuple(t.to(args.device) for t in batch) + input_ids, input_mask, segment_ids, label_ids = batch + + with torch.no_grad(): + outputs = model(input_ids, + token_type_ids=segment_ids, + attention_mask=input_mask, + labels=label_ids) + tmp_eval_loss, logits = outputs[:2] + + eval_loss += tmp_eval_loss.mean().item() + nb_eval_steps += 1 + if preds is None: + preds = logits.detach().cpu().numpy() + out_label_ids = label_ids.detach().cpu().numpy() + else: + preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) + out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0) + + eval_loss = eval_loss / nb_eval_steps + if args.output_mode == "classification": + preds = np.argmax(preds, axis=1) + elif args.output_mode == "regression": + preds = np.squeeze(preds) + result = compute_metrics(eval_task, preds, out_label_ids) + + output_eval_file = os.path.join(eval_output_dir, "eval_results.txt") + with open(output_eval_file, "w") as writer: + logger.info("***** Eval results *****") + for key in sorted(result.keys()): + logger.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +def load_and_cache_examples(args, task, tokenizer, eval=False): + processor = processors[task]() + output_mode = output_modes[task] + label_list = processor.get_labels() + + # Load and cache data + processor = processors[task]() + examples = processor.get_dev_examples(args.data_dir) + cached_features_file = os.path.join(args.data_dir, '{}_{}_{}_{}'.format( + 'dev' if eval else 'train', + list(filter(None, args.bert_model.split('/'))).pop(), + str(args.max_seq_length), + str(task))) + + if os.path.exists(cached_features_file): + features = torch.load(cached_features_file) + else: + features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode) + if args.local_rank == -1 or torch.distributed.get_rank() == 0: + logger.info(" Saving eval features into cached file %s", cached_features_file) + torch.save(features, cached_features_file) + + return features + + def main(): parser = argparse.ArgumentParser() @@ -118,40 +296,32 @@ def main(): # Setup CUDA, GPU & distributed training if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") - n_gpu = torch.cuda.device_count() + args.n_gpu = torch.cuda.device_count() else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl') - n_gpu = 1 + args.n_gpu = 1 args.device = device # Setup logging logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( - device, n_gpu, bool(args.local_rank != -1), args.fp16)) + device, args.n_gpu, bool(args.local_rank != -1), args.fp16)) # Setup seeds random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) - if n_gpu > 0: + if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) - # Safety checks and create output directory - if not args.do_train and not args.do_eval: - raise ValueError("At least one of `do_train` or `do_eval` must be True.") - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: - raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) - if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: - os.makedirs(args.output_dir) - # Prepare GLUE task - task_name = args.task_name.lower() - if task_name not in processors: - raise ValueError("Task not found: %s" % (task_name)) - processor = processors[task_name]() - output_mode = output_modes[task_name] + args.task_name = args.task_name.lower() + if args.task_name not in processors: + raise ValueError("Task not found: %s" % (args.task_name)) + processor = processors[args.task_name]() + args.output_mode = output_modes[args.task_name] label_list = processor.get_labels() num_labels = len(label_list) @@ -169,122 +339,23 @@ def main(): # Distributed, parrallel and fp16 model if args.fp16: model.half() - model.to(device) + model.to(args.device) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True) - elif n_gpu > 1: + elif args.n_gpu > 1: model = torch.nn.DataParallel(model) - global_step = 0 - tr_loss = 0 + # Training if args.do_train: - if args.local_rank in [-1, 0]: - tb_writer = SummaryWriter() + train_features = load_and_cache_examples(args, args.task_name, tokenizer, eval=False) + global_step, tr_loss = train(args, train_features, model) + logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - # Load and cache data - train_examples = processor.get_train_examples(args.data_dir) - cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format( - list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(task_name))) - if os.path.exists(cached_train_features_file): - train_features = torch.load(cached_train_features_file) - else: - train_features = convert_examples_to_features( - train_examples, label_list, args.max_seq_length, tokenizer, output_mode) - if args.local_rank == -1 or torch.distributed.get_rank() == 0: - logger.info(" Saving train features into cached file %s", cached_train_features_file) - torch.save(train_features, cached_train_features_file) - # Convert in tensors and build dataloader - all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) - if output_mode == "classification": - all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) - elif output_mode == "regression": - all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float) - - args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps - - train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - train_sampler = RandomSampler(train_data) if args.local_rank == -1 else DistributedSampler(train_data) - train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) - - num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs - - # Prepare optimizer - param_optimizer = list(model.named_parameters()) - no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, - {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - if args.fp16: - try: - from apex.optimizers import FP16_Optimizer, FusedAdam - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") - optimizer = FusedAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - bias_correction=False, - max_grad_norm=1.0) - if args.loss_scale == 0: - optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) - else: - optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) - warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - - else: - optimizer = BertAdam(optimizer_grouped_parameters, - lr=args.learning_rate, - warmup=args.warmup_proportion, - t_total=num_train_optimization_steps) - - # Train! - logger.info("***** Running training *****") - logger.info(" Num examples = %d", len(train_examples)) - logger.info(" Batch size = %d", args.train_batch_size) - logger.info(" Num steps = %d", num_train_optimization_steps) - model.train() - for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]): - for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): - batch = tuple(t.to(device) for t in batch) - input_ids, input_mask, segment_ids, label_ids = batch - - ouputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids) - loss = ouputs[0] - - if n_gpu > 1: - loss = loss.mean() # mean() to average on multi-gpu parallel training - if args.gradient_accumulation_steps > 1: - loss = loss / args.gradient_accumulation_steps - - if args.fp16: - optimizer.backward(loss) - else: - loss.backward() - - tr_loss += loss.item() - if (step + 1) % args.gradient_accumulation_steps == 0: - if args.fp16: - # modify learning rate with special warm up BERT uses - # if args.fp16 is False, BertAdam is used that handles this automatically - lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion) - for param_group in optimizer.param_groups: - param_group['lr'] = lr_this_step - optimizer.step() - optimizer.zero_grad() - global_step += 1 - if args.local_rank in [-1, 0]: - if not args.fp16: - tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step) - tb_writer.add_scalar('loss', loss.item(), global_step) - - ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() - ### Example: + # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained() if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self @@ -307,94 +378,18 @@ def main(): else: model = BertForSequenceClassification.from_pretrained(args.bert_model) - model.to(device) + model.to(args.device) - ### Evaluation + # Evaluation if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): - eval_task_names = ("mnli", "mnli-mm") if task_name == "mnli" else (task_name,) - eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if task_name == "mnli" else (args.output_dir,) - for eval_task, output_dir in zip(eval_task_names, eval_outputs_dirs): - if os.path.exists(output_dir) and os.listdir(output_dir) and args.do_train: - raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) - if not os.path.exists(output_dir): - os.makedirs(output_dir) + # Handle MNLI double evaluation + eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,) + eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,) - # Load and cache data - processor = processors[eval_task]() - eval_examples = processor.get_dev_examples(args.data_dir) - cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format( - list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(eval_task))) - if os.path.exists(cached_eval_features_file): - eval_features = torch.load(cached_eval_features_file) - else: - eval_features = convert_examples_to_features( - eval_examples, label_list, args.max_seq_length, tokenizer, output_mode) - if args.local_rank == -1 or torch.distributed.get_rank() == 0: - logger.info(" Saving eval features into cached file %s", cached_eval_features_file) - torch.save(eval_features, cached_eval_features_file) + for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs): + eval_features = load_and_cache_examples(args, eval_task, tokenizer, eval=True) - # Convert in tensors and build dataloader - all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) - if output_mode == "classification": - all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) - elif output_mode == "regression": - all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float) - - eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - # Note that DistributedSampler samples randomly - eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data) - eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) - - # Eval! - logger.info("***** Running evaluation *****") - logger.info(" Num examples = %d", len(eval_examples)) - logger.info(" Batch size = %d", args.eval_batch_size) - model.eval() - eval_loss = 0 - nb_eval_steps = 0 - preds = None - out_label_ids = None - for batch in tqdm(eval_dataloader, desc="Evaluating"): - batch = tuple(t.to(device) for t in batch) - input_ids, input_mask, segment_ids, label_ids = batch - - with torch.no_grad(): - outputs = model(input_ids, - token_type_ids=segment_ids, - attention_mask=input_mask, - labels=label_ids) - tmp_eval_loss, logits = outputs[:2] - - eval_loss += tmp_eval_loss.mean().item() - nb_eval_steps += 1 - if preds is None: - preds = logits.detach().cpu().numpy() - out_label_ids = label_ids.detach().cpu().numpy() - else: - preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0) - - eval_loss = eval_loss / nb_eval_steps - if output_mode == "classification": - preds = np.argmax(preds, axis=1) - elif output_mode == "regression": - preds = np.squeeze(preds) - result = compute_metrics(eval_task, preds, out_label_ids) - - loss = tr_loss/global_step if args.do_train else None - - result['eval_loss'] = eval_loss - result['global_step'] = global_step - result['loss'] = loss - - output_eval_file = os.path.join(output_dir, "eval_results.txt") - with open(output_eval_file, "w") as writer: - logger.info("***** Eval results *****") - for key in sorted(result.keys()): - logger.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) + evalutate(args, eval_task, eval_output_dir, eval_features, model) if __name__ == "__main__":