Revert DeepSpeed stuff from accelerate integration (#22899)

2023-04-20 14:23:59 -04:00
parent f143037789
commit 5764e67cee
1 changed files with 62 additions and 47 deletions
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -1544,24 +1544,39 @@ class TrainingArguments:
            self._n_gpu = 1
            torch.cuda.set_device(device)
        elif self.deepspeed:
-            self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
+            # deepspeed inits torch.distributed internally
+            from .deepspeed import is_deepspeed_available
+
+            if not is_deepspeed_available():
+                raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.")
+            import deepspeed
+
+            deepspeed.init_distributed(timeout=timedelta(seconds=self.ddp_timeout))
+
+            # workaround for setups like notebooks where the launcher can't be used,
+            # but deepspeed requires a dist env.
+            # env LOCAL_RANK could be set manually by the user, or via init_distributed if mpi4py is installed
+            self.local_rank = int(os.environ.get("LOCAL_RANK", "-1"))
+
+            device = torch.device("cuda", self.local_rank)
            self._n_gpu = 1
        else:
            self.distributed_state = PartialState(backend=self.xpu_backend)
            self._n_gpu = 1
-        if not is_sagemaker_mp_enabled():
+        if not is_sagemaker_mp_enabled() and not self.deepspeed:
            device = self.distributed_state.device
            self.local_rank = self.distributed_state.local_process_index
        if (
            torch.distributed.is_available()
            and torch.distributed.is_initialized()
+            and hasattr(self, "distributed_state")
            and self.distributed_state.distributed_type == DistributedType.NO
        ):
            logger.warning(
                "torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. "
                "In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
            )
-
+        if not self.deepspeed:
            if is_torch_tpu_available():
                device = self.distributed_state.device
                self._n_gpu = 0
@@ -1649,7 +1664,7 @@ class TrainingArguments:
            return ParallelMode.SAGEMAKER_MODEL_PARALLEL
        elif is_sagemaker_dp_enabled():
            return ParallelMode.SAGEMAKER_DATA_PARALLEL
-        elif hasattr(self, "distributed_state") and (self.distributed_state.distributed_type != DistributedType.NO):
+        elif self.deepspeed or self.distributed_state.distributed_type != DistributedType.NO:
            return ParallelMode.DISTRIBUTED
        elif self.n_gpu > 1:
            return ParallelMode.NOT_DISTRIBUTED