From 8c3b1fcb6712ff38de4f134224474c58d14f57c1 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Thu, 4 Feb 2021 07:44:56 -0800
Subject: [PATCH] [trainer] a few fixes (#9993)

* trainer fixes

* don't switch the model just for deepspeed and mp

* correct the fix
---
 src/transformers/trainer.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index a08938caa7..c18edc3455 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -264,11 +264,14 @@ class Trainer:
         self.eval_dataset = eval_dataset
         self.tokenizer = tokenizer
 
-        # Model parallel
-        if not self.is_model_parallel:
+        # postpone switching model to cuda when:
+        # 1. MP - since we are trying to fit a much bigger than 1 gpu model
+        # 2. fp16-enabled DeepSpeed loads the model in half the size and it doesn't need .to() anyway
+        if not (self.is_model_parallel or args.deepspeed):
             model = model.to(args.device)
-        else:
-            # Force n_gpu to 1 to avoid DataParallel.
+
+        # Force n_gpu to 1 to avoid DataParallel as MP will manage the GPUs
+        if self.is_model_parallel:
             self.args._n_gpu = 1
 
         # later use `self.model is self.model_wrapped` to check if it's wrapped or not
@@ -790,6 +793,8 @@ class Trainer:
             model = ShardedDDP(model, self.optimizer)
         elif is_sagemaker_distributed_available():
             model = DDP(model, device_ids=[dist.get_local_rank()], broadcast_buffers=False)
+        if self.deepspeed:
+            pass  # already initialized its own DDP earlier
         elif self.args.local_rank != -1:
             if self.args.ddp_find_unused_parameters is not None:
                 find_unused_parameters = self.args.ddp_find_unused_parameters