diff --git a/examples/run_classifier.py b/examples/run_classifier.py index eff48ca97d..908559d577 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -736,9 +736,28 @@ def main(): tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - train_examples = None - num_train_optimization_steps = None + # Prepare model + cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) + model = BertForSequenceClassification.from_pretrained(args.bert_model, + cache_dir=cache_dir, + num_labels=num_labels) + if args.fp16: + model.half() + model.to(device) + if args.local_rank != -1: + try: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + model = DDP(model) + elif n_gpu > 1: + model = torch.nn.DataParallel(model) + if args.do_train: + + # Prepare data loader + train_examples = processor.get_train_examples(args.data_dir) train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, tokenizer, output_mode) @@ -762,26 +781,8 @@ def main(): if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() - # Prepare model - cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)) - model = BertForSequenceClassification.from_pretrained(args.bert_model, - cache_dir=cache_dir, - num_labels=num_labels) - if args.fp16: - model.half() - model.to(device) - if args.local_rank != -1: - try: - from apex.parallel import DistributedDataParallel as DDP - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + # Prepare optimizer - model = DDP(model) - elif n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Prepare optimizer - if args.do_train: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ @@ -812,10 +813,10 @@ def main(): warmup=args.warmup_proportion, t_total=num_train_optimization_steps) - global_step = 0 - nb_tr_steps = 0 - tr_loss = 0 - if args.do_train: + global_step = 0 + nb_tr_steps = 0 + tr_loss = 0 + logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) diff --git a/examples/run_squad.py b/examples/run_squad.py index ae163afc91..8ce8b60294 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -894,14 +894,31 @@ def main(): tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - train_examples = None - num_train_optimization_steps = None + # Prepare model + model = BertForQuestionAnswering.from_pretrained(args.bert_model, + cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) + + if args.fp16: + model.half() + model.to(device) + if args.local_rank != -1: + try: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + model = DDP(model) + elif n_gpu > 1: + model = torch.nn.DataParallel(model) + if args.do_train: + + # Prepare data loader + train_examples = read_squad_examples( input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative) cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format( list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length)) - train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) @@ -933,25 +950,8 @@ def main(): if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() - # Prepare model - model = BertForQuestionAnswering.from_pretrained(args.bert_model, - cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank))) + # Prepare optimizer - if args.fp16: - model.half() - model.to(device) - if args.local_rank != -1: - try: - from apex.parallel import DistributedDataParallel as DDP - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") - - model = DDP(model) - elif n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Prepare optimizer - if args.do_train: param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used @@ -987,8 +987,8 @@ def main(): warmup=args.warmup_proportion, t_total=num_train_optimization_steps) - global_step = 0 - if args.do_train: + global_step = 0 + logger.info("***** Running training *****") logger.info(" Num orig examples = %d", len(train_examples)) logger.info(" Num split examples = %d", len(train_features)) diff --git a/examples/run_swag.py b/examples/run_swag.py index bdc256cf14..daae3971f7 100644 --- a/examples/run_swag.py +++ b/examples/run_swag.py @@ -358,9 +358,27 @@ def main(): tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) - train_examples = None - num_train_optimization_steps = None + # Prepare model + model = BertForMultipleChoice.from_pretrained(args.bert_model, + cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)), + num_choices=4) + if args.fp16: + model.half() + model.to(device) + if args.local_rank != -1: + try: + from apex.parallel import DistributedDataParallel as DDP + except ImportError: + raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + + model = DDP(model) + elif n_gpu > 1: + model = torch.nn.DataParallel(model) + if args.do_train: + + # Prepare data loader + train_examples = read_swag_examples(os.path.join(args.data_dir, 'train.csv'), is_training = True) train_features = convert_examples_to_features( train_examples, tokenizer, args.max_seq_length, True) @@ -379,25 +397,8 @@ def main(): if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() - # Prepare model - model = BertForMultipleChoice.from_pretrained(args.bert_model, - cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE), 'distributed_{}'.format(args.local_rank)), - num_choices=4) - if args.fp16: - model.half() - model.to(device) - if args.local_rank != -1: - try: - from apex.parallel import DistributedDataParallel as DDP - except ImportError: - raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") + # Prepare optimizer - model = DDP(model) - elif n_gpu > 1: - model = torch.nn.DataParallel(model) - - # Prepare optimizer - if args.do_train: param_optimizer = list(model.named_parameters()) # hack to remove pooler, which is not used @@ -432,8 +433,8 @@ def main(): warmup=args.warmup_proportion, t_total=num_train_optimization_steps) - global_step = 0 - if args.do_train: + global_step = 0 + logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size)