[trainer] deepspeed bug fixes and tests (#10039)

* deepspeed bug fixes and tests * manual wrap?
2021-02-08 09:44:02 -08:00
parent f285e4c3ad
commit 322037e842
3 changed files with 151 additions and 36 deletions
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -932,7 +932,11 @@ class Trainer:
                if (step + 1) % self.args.gradient_accumulation_steps == 0:
                    self.control = self.callback_handler.on_step_begin(self.args, self.state, self.control)

-                if ((step + 1) % self.args.gradient_accumulation_steps != 0) and self.args.local_rank != -1:
+                if (
+                    ((step + 1) % self.args.gradient_accumulation_steps != 0)
+                    and self.args.local_rank != -1
+                    and not self.args.deepspeed
+                ):
                    # Avoid unnecessary DDP synchronization since there will be no backward pass on this example.
                    with model.no_sync():
                        tr_loss += self.training_step(model, inputs)
@@ -1588,7 +1592,17 @@ class Trainer:
            prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only
        )

+        if self.args.deepspeed and not self.args.do_train:
+            # In the future we probably can run deepspeed for inference too, but this will require
+            # some thinking about how to best run it - since while it works DeepSpeed wasn't
+            # designed for inference
+
+            # since we have to postpone model.to() till training for DeepSpeed, if there was no
+            # training, we must put the model on the right device
+            self.model = self.model.to(self.args.device)
+
        model = self.model
+
        # multi-gpu eval
        if self.args.n_gpu > 1:
            model = torch.nn.DataParallel(model)