Revert DeepSpeed stuff from accelerate integration (#22899)
This commit is contained in:
@@ -1544,24 +1544,39 @@ class TrainingArguments:
|
|||||||
self._n_gpu = 1
|
self._n_gpu = 1
|
||||||
torch.cuda.set_device(device)
|
torch.cuda.set_device(device)
|
||||||
elif self.deepspeed:
|
elif self.deepspeed:
|
||||||
self.distributed_state = PartialState(timeout=timedelta(seconds=self.ddp_timeout))
|
# deepspeed inits torch.distributed internally
|
||||||
|
from .deepspeed import is_deepspeed_available
|
||||||
|
|
||||||
|
if not is_deepspeed_available():
|
||||||
|
raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.")
|
||||||
|
import deepspeed
|
||||||
|
|
||||||
|
deepspeed.init_distributed(timeout=timedelta(seconds=self.ddp_timeout))
|
||||||
|
|
||||||
|
# workaround for setups like notebooks where the launcher can't be used,
|
||||||
|
# but deepspeed requires a dist env.
|
||||||
|
# env LOCAL_RANK could be set manually by the user, or via init_distributed if mpi4py is installed
|
||||||
|
self.local_rank = int(os.environ.get("LOCAL_RANK", "-1"))
|
||||||
|
|
||||||
|
device = torch.device("cuda", self.local_rank)
|
||||||
self._n_gpu = 1
|
self._n_gpu = 1
|
||||||
else:
|
else:
|
||||||
self.distributed_state = PartialState(backend=self.xpu_backend)
|
self.distributed_state = PartialState(backend=self.xpu_backend)
|
||||||
self._n_gpu = 1
|
self._n_gpu = 1
|
||||||
if not is_sagemaker_mp_enabled():
|
if not is_sagemaker_mp_enabled() and not self.deepspeed:
|
||||||
device = self.distributed_state.device
|
device = self.distributed_state.device
|
||||||
self.local_rank = self.distributed_state.local_process_index
|
self.local_rank = self.distributed_state.local_process_index
|
||||||
if (
|
if (
|
||||||
torch.distributed.is_available()
|
torch.distributed.is_available()
|
||||||
and torch.distributed.is_initialized()
|
and torch.distributed.is_initialized()
|
||||||
|
and hasattr(self, "distributed_state")
|
||||||
and self.distributed_state.distributed_type == DistributedType.NO
|
and self.distributed_state.distributed_type == DistributedType.NO
|
||||||
):
|
):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. "
|
"torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. "
|
||||||
"In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
|
"In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
|
||||||
)
|
)
|
||||||
|
if not self.deepspeed:
|
||||||
if is_torch_tpu_available():
|
if is_torch_tpu_available():
|
||||||
device = self.distributed_state.device
|
device = self.distributed_state.device
|
||||||
self._n_gpu = 0
|
self._n_gpu = 0
|
||||||
@@ -1649,7 +1664,7 @@ class TrainingArguments:
|
|||||||
return ParallelMode.SAGEMAKER_MODEL_PARALLEL
|
return ParallelMode.SAGEMAKER_MODEL_PARALLEL
|
||||||
elif is_sagemaker_dp_enabled():
|
elif is_sagemaker_dp_enabled():
|
||||||
return ParallelMode.SAGEMAKER_DATA_PARALLEL
|
return ParallelMode.SAGEMAKER_DATA_PARALLEL
|
||||||
elif hasattr(self, "distributed_state") and (self.distributed_state.distributed_type != DistributedType.NO):
|
elif self.deepspeed or self.distributed_state.distributed_type != DistributedType.NO:
|
||||||
return ParallelMode.DISTRIBUTED
|
return ParallelMode.DISTRIBUTED
|
||||||
elif self.n_gpu > 1:
|
elif self.n_gpu > 1:
|
||||||
return ParallelMode.NOT_DISTRIBUTED
|
return ParallelMode.NOT_DISTRIBUTED
|
||||||
|
|||||||
Reference in New Issue
Block a user