fix #868
This commit is contained in:
@@ -92,6 +92,12 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
||||||
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
|
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
|
||||||
|
|
||||||
|
# Distributed training (should be after apex fp16 initialization)
|
||||||
|
if args.local_rank != -1:
|
||||||
|
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
|
||||||
|
output_device=args.local_rank,
|
||||||
|
find_unused_parameters=True)
|
||||||
|
|
||||||
# Train!
|
# Train!
|
||||||
logger.info("***** Running training *****")
|
logger.info("***** Running training *****")
|
||||||
logger.info(" Num examples = %d", len(train_dataset))
|
logger.info(" Num examples = %d", len(train_dataset))
|
||||||
@@ -411,13 +417,8 @@ def main():
|
|||||||
if args.local_rank == 0:
|
if args.local_rank == 0:
|
||||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||||
|
|
||||||
# Distributed and parallel training
|
|
||||||
model.to(args.device)
|
model.to(args.device)
|
||||||
if args.local_rank != -1:
|
if args.n_gpu > 1:
|
||||||
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
|
|
||||||
output_device=args.local_rank,
|
|
||||||
find_unused_parameters=True)
|
|
||||||
elif args.n_gpu > 1:
|
|
||||||
model = torch.nn.DataParallel(model)
|
model = torch.nn.DataParallel(model)
|
||||||
|
|
||||||
logger.info("Training/evaluation parameters %s", args)
|
logger.info("Training/evaluation parameters %s", args)
|
||||||
|
|||||||
@@ -101,6 +101,12 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
||||||
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
|
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
|
||||||
|
|
||||||
|
# Distributed training (should be after apex fp16 initialization)
|
||||||
|
if args.local_rank != -1:
|
||||||
|
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
|
||||||
|
output_device=args.local_rank,
|
||||||
|
find_unused_parameters=True)
|
||||||
|
|
||||||
# Train!
|
# Train!
|
||||||
logger.info("***** Running training *****")
|
logger.info("***** Running training *****")
|
||||||
logger.info(" Num examples = %d", len(train_dataset))
|
logger.info(" Num examples = %d", len(train_dataset))
|
||||||
@@ -450,13 +456,8 @@ def main():
|
|||||||
if args.local_rank == 0:
|
if args.local_rank == 0:
|
||||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||||
|
|
||||||
# Distributed and parrallel training
|
|
||||||
model.to(args.device)
|
model.to(args.device)
|
||||||
if args.local_rank != -1:
|
if args.n_gpu > 1:
|
||||||
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
|
|
||||||
output_device=args.local_rank,
|
|
||||||
find_unused_parameters=True)
|
|
||||||
elif args.n_gpu > 1:
|
|
||||||
model = torch.nn.DataParallel(model)
|
model = torch.nn.DataParallel(model)
|
||||||
|
|
||||||
logger.info("Training/evaluation parameters %s", args)
|
logger.info("Training/evaluation parameters %s", args)
|
||||||
|
|||||||
Reference in New Issue
Block a user