From 4a20b7c450093a5521a9481c8d26277084b22664 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Wed, 20 Jan 2021 16:50:21 -0800
Subject: [PATCH] [trainer] no --deepspeed and --sharded_ddp together (#9712)

* no --deepspeed and --sharded_ddp together

* Update src/transformers/trainer.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* style

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 src/transformers/trainer.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index c5e745577e..edc7a09cec 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -337,12 +337,15 @@ class Trainer:
         # Setup Sharded DDP training
         self.sharded_dpp = False
         if args.sharded_ddp:
+            if args.deepspeed:
+                raise ValueError(
+                    "Using --sharded_ddp together with --deepspeed is not possible, deactivate one of those flags."
+                )
+
             if args.local_rank == -1:
                 raise ValueError("Using sharded DDP only works in distributed training.")
             elif not is_fairscale_available():
                 raise ImportError("Sharded DDP training requires fairscale: `pip install fairscale`.")
-            elif args.deepspeed:
-                raise ValueError("can't use --sharded_ddp together with --deepspeed.")
             else:
                 self.sharded_dpp = True