From adb3ef636877586ab64ea9be97f3407433d053d8 Mon Sep 17 00:00:00 2001 From: zijunsun Date: Thu, 25 Jul 2019 13:09:10 +0800 Subject: [PATCH] multi-gpu training also should be after apex fp16 --- examples/run_glue.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/run_glue.py b/examples/run_glue.py index 5d9abd06fc..0d4ffaa390 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -92,6 +92,10 @@ def train(args, train_dataset, model, tokenizer): raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) + # multi-gpu training (should be after apex fp16 initialization) + if args.n_gpu > 1: + model = torch.nn.DataParallel(model) + # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], @@ -418,8 +422,6 @@ def main(): torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab model.to(args.device) - if args.n_gpu > 1: - model = torch.nn.DataParallel(model) logger.info("Training/evaluation parameters %s", args)