From 8b129030cbce6e5a702dae873e52221d2b35c9fb Mon Sep 17 00:00:00 2001
From: Zachary Mueller <muellerzr@gmail.com>
Date: Wed, 26 Apr 2023 15:35:59 -0400
Subject: [PATCH] Bring back PartialState DeepSpeed (#22921)

* Bring back deepspeed integration

* Branchname

* Self-scheduled

* newline

* Use deepspeed env var

* Remove comment

* Del env var after partialstate
---
 src/transformers/training_args.py | 110 +++++++++++++-----------------
 1 file changed, 48 insertions(+), 62 deletions(-)

diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index a2a12862f7..55945a4382 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -1544,84 +1544,70 @@ class TrainingArguments:
             self._n_gpu = 1
             torch.cuda.set_device(device)
         elif self.deepspeed:
-            # deepspeed inits torch.distributed internally
-            from .deepspeed import is_deepspeed_available
-
-            if not is_deepspeed_available():
-                raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.")
-            import deepspeed
-
-            deepspeed.init_distributed(timeout=timedelta(seconds=self.ddp_timeout))
-
-            # workaround for setups like notebooks where the launcher can't be used,
-            # but deepspeed requires a dist env.
-            # env LOCAL_RANK could be set manually by the user, or via init_distributed if mpi4py is installed
-            self.local_rank = int(os.environ.get("LOCAL_RANK", "-1"))
-
-            device = torch.device("cuda", self.local_rank)
+            # Need to do similar for Accelerator init
+            os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
+            self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
+            del os.environ["ACCELERATE_USE_DEEPSPEED"]
             self._n_gpu = 1
         else:
             self.distributed_state = PartialState(backend=self.xpu_backend)
             self._n_gpu = 1
-        if not is_sagemaker_mp_enabled() and not self.deepspeed:
+        if not is_sagemaker_mp_enabled():
             device = self.distributed_state.device
             self.local_rank = self.distributed_state.local_process_index
         if (
             torch.distributed.is_available()
             and torch.distributed.is_initialized()
-            and hasattr(self, "distributed_state")
             and self.distributed_state.distributed_type == DistributedType.NO
         ):
             logger.warning(
                 "torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. "
                 "In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
             )
-        if not self.deepspeed:
-            if is_torch_tpu_available():
-                device = self.distributed_state.device
-                self._n_gpu = 0
-            elif is_sagemaker_dp_enabled():
-                self._n_gpu = 1
-            elif self.distributed_state.distributed_type == DistributedType.NO:
-                if self.use_mps_device:
-                    if not torch.backends.mps.is_available():
-                        if not torch.backends.mps.is_built():
-                            raise AssertionError(
-                                "MPS not available because the current PyTorch install was not "
-                                "built with MPS enabled. Please install torch version >=1.12.0 on "
-                                "your Apple silicon Mac running macOS 12.3 or later with a native "
-                                "version (arm64) of Python"
-                            )
-                        else:
-                            raise AssertionError(
-                                "MPS not available because the current MacOS version is not 12.3+ "
-                                "and/or you do not have an MPS-enabled device on this machine."
-                            )
+        if is_torch_tpu_available():
+            device = self.distributed_state.device
+            self._n_gpu = 0
+        elif is_sagemaker_dp_enabled():
+            self._n_gpu = 1
+        elif self.distributed_state.distributed_type == DistributedType.NO:
+            if self.use_mps_device:
+                if not torch.backends.mps.is_available():
+                    if not torch.backends.mps.is_built():
+                        raise AssertionError(
+                            "MPS not available because the current PyTorch install was not "
+                            "built with MPS enabled. Please install torch version >=1.12.0 on "
+                            "your Apple silicon Mac running macOS 12.3 or later with a native "
+                            "version (arm64) of Python"
+                        )
                     else:
-                        if not version.parse(version.parse(torch.__version__).base_version) > version.parse("1.12.0"):
-                            warnings.warn(
-                                "We strongly recommend to install PyTorch >= 1.13 (nightly version at the time of writing)"
-                                " on your MacOS machine. It has major fixes related to model correctness and performance"
-                                " improvements for transformer based models. Please refer to"
-                                " https://github.com/pytorch/pytorch/issues/82707 for more details."
-                            )
-                        device = torch.device("mps")
-                        self._n_gpu = 1
-
+                        raise AssertionError(
+                            "MPS not available because the current MacOS version is not 12.3+ "
+                            "and/or you do not have an MPS-enabled device on this machine."
+                        )
                 else:
-                    # if n_gpu is > 1 we'll use nn.DataParallel.
-                    # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
-                    # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
-                    # trigger an error that a device index is missing. Index 0 takes into account the
-                    # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
-                    # will use the first GPU in that env, i.e. GPU#1
-                    # device = self.distributed_state.device
-                    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
-                    # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at
-                    # the default value.
-                    self._n_gpu = torch.cuda.device_count()
-                    if device.type == "cuda":
-                        torch.cuda.set_device(device)
+                    if not version.parse(version.parse(torch.__version__).base_version) > version.parse("1.12.0"):
+                        warnings.warn(
+                            "We strongly recommend to install PyTorch >= 1.13 (nightly version at the time of writing)"
+                            " on your MacOS machine. It has major fixes related to model correctness and performance"
+                            " improvements for transformer based models. Please refer to"
+                            " https://github.com/pytorch/pytorch/issues/82707 for more details."
+                        )
+                    device = torch.device("mps")
+                    self._n_gpu = 1
+
+            else:
+                # if n_gpu is > 1 we'll use nn.DataParallel.
+                # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
+                # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
+                # trigger an error that a device index is missing. Index 0 takes into account the
+                # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
+                # will use the first GPU in that env, i.e. GPU#1
+                device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+                # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at
+                # the default value.
+                self._n_gpu = torch.cuda.device_count()
+                if device.type == "cuda":
+                    torch.cuda.set_device(device)
         return device
 
     @property
@@ -1664,7 +1650,7 @@ class TrainingArguments:
             return ParallelMode.SAGEMAKER_MODEL_PARALLEL
         elif is_sagemaker_dp_enabled():
             return ParallelMode.SAGEMAKER_DATA_PARALLEL
-        elif self.deepspeed or self.distributed_state.distributed_type != DistributedType.NO:
+        elif hasattr(self, "distributed_state") and self.distributed_state.distributed_type != DistributedType.NO:
             return ParallelMode.DISTRIBUTED
         elif self.n_gpu > 1:
             return ParallelMode.NOT_DISTRIBUTED