From bf14ef75f16d0c8dc54ed587d887cae7c66050df Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Mon, 18 May 2020 23:13:33 -0400
Subject: [PATCH] [Trainer] move model to device before setting optimizer
 (#4450)

---
 src/transformers/trainer.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 8db4eb0b81..9aca17b8fc 100644
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -188,7 +188,7 @@ class Trainer:
             prediction_loss_only:
                 (Optional) in evaluation and prediction, only return the loss
         """
-        self.model = model
+        self.model = model.to(args.device)
         self.args = args
         if data_collator is not None:
             self.data_collator = data_collator
@@ -393,7 +393,6 @@ class Trainer:
             scheduler.load_state_dict(torch.load(os.path.join(model_path, "scheduler.pt")))
 
         model = self.model
-        model.to(self.args.device)
         if self.args.fp16:
             if not is_apex_available():
                 raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
@@ -726,7 +725,6 @@ class Trainer:
         prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else self.prediction_loss_only
 
         model = self.model
-        model.to(self.args.device)
         # multi-gpu eval
         if self.args.n_gpu > 1:
             model = torch.nn.DataParallel(model)