[trainer] a few fixes (#9993)
* trainer fixes
* don't switch the model just for deepspeed and mp
* correct the fix
This commit is contained in:
@@ -264,11 +264,14 @@ class Trainer:
|
||||
self.eval_dataset = eval_dataset
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
# Model parallel
|
||||
if not self.is_model_parallel:
|
||||
# postpone switching model to cuda when:
|
||||
# 1. MP - since we are trying to fit a much bigger than 1 gpu model
|
||||
# 2. fp16-enabled DeepSpeed loads the model in half the size and it doesn't need .to() anyway
|
||||
if not (self.is_model_parallel or args.deepspeed):
|
||||
model = model.to(args.device)
|
||||
else:
|
||||
# Force n_gpu to 1 to avoid DataParallel.
|
||||
|
||||
# Force n_gpu to 1 to avoid DataParallel as MP will manage the GPUs
|
||||
if self.is_model_parallel:
|
||||
self.args._n_gpu = 1
|
||||
|
||||
# later use `self.model is self.model_wrapped` to check if it's wrapped or not
|
||||
@@ -790,6 +793,8 @@ class Trainer:
|
||||
model = ShardedDDP(model, self.optimizer)
|
||||
elif is_sagemaker_distributed_available():
|
||||
model = DDP(model, device_ids=[dist.get_local_rank()], broadcast_buffers=False)
|
||||
if self.deepspeed:
|
||||
pass # already initialized its own DDP earlier
|
||||
elif self.args.local_rank != -1:
|
||||
if self.args.ddp_find_unused_parameters is not None:
|
||||
find_unused_parameters = self.args.ddp_find_unused_parameters
|
||||
|
||||
Reference in New Issue
Block a user