From c59d71b28235bd75cf78ca31cee3b559284a232a Mon Sep 17 00:00:00 2001 From: jeffhataws <56947987+jeffhataws@users.noreply.github.com> Date: Wed, 18 Jan 2023 08:21:19 -0800 Subject: [PATCH] Add AWS Neuron torchrun support (#20806) * Add XLA torchrun support * Clarify that currently DDP doesn't work with torch.distributed XLA backend yet * Enable DDP with torchrun and XLA (now available in PT-XLA 1.13) * Add check for AWS Neuron availability and AWS Neuron specific compiler flag * Change the new test's name to TestTrainerDistributedNeuronCore * Remove "assert" and replace raised exception * Remove compiler flag as it is optional. If needed, will be another PR. * Use TORCHELASTIC_RUN_ID to determine whether torchrun is used --- src/transformers/__init__.py | 2 ++ src/transformers/testing_utils.py | 10 ++++++++++ src/transformers/training_args.py | 12 ++++++++++++ src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 7 +++++++ tests/trainer/test_trainer_distributed.py | 18 ++++++++++++++++++ 6 files changed, 50 insertions(+) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 62f10ace2f..e3b6750246 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -577,6 +577,7 @@ _import_structure = { "is_timm_available", "is_tokenizers_available", "is_torch_available", + "is_torch_neuroncore_available", "is_torch_tpu_available", "is_vision_available", "logging", @@ -3947,6 +3948,7 @@ if TYPE_CHECKING: is_timm_available, is_tokenizers_available, is_torch_available, + is_torch_neuroncore_available, is_torch_tpu_available, is_vision_available, logging, diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index 31760557aa..149b317584 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -83,6 +83,7 @@ from .utils import ( is_torch_available, is_torch_bf16_cpu_available, is_torch_bf16_gpu_available, + is_torch_neuroncore_available, is_torch_tensorrt_fx_available, is_torch_tf32_available, is_torch_tpu_available, @@ -500,6 +501,15 @@ def require_torch_tpu(test_case): return unittest.skipUnless(is_torch_tpu_available(check_device=False), "test requires PyTorch TPU")(test_case) +def require_torch_neuroncore(test_case): + """ + Decorator marking a test that requires NeuronCore (in PyTorch). + """ + return unittest.skipUnless(is_torch_neuroncore_available(check_device=False), "test requires PyTorch NeuronCore")( + test_case + ) + + if is_torch_available(): # Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode import torch diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index b5c6025176..1a907107f8 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -46,6 +46,7 @@ from .utils import ( is_torch_available, is_torch_bf16_cpu_available, is_torch_bf16_gpu_available, + is_torch_neuroncore_available, is_torch_tf32_available, is_torch_tpu_available, logging, @@ -60,6 +61,17 @@ if is_torch_available(): if is_torch_tpu_available(check_device=False): import torch_xla.core.xla_model as xm +if is_torch_neuroncore_available(check_device=False): + # torchrun support + # https://github.com/pytorch/xla/pull/3609 + if os.environ.get("TORCHELASTIC_RUN_ID"): + import torch_xla.distributed.xla_backend as xbn + + if not isinstance(torch.distributed.group.WORLD, xbn.ProcessGroupXla): + torch.distributed.init_process_group(backend="xla") + if not isinstance(torch.distributed.group.WORLD, xbn.ProcessGroupXla): + raise AssertionError("Failed to initialize torch.distributed process group using XLA backend.") + if is_sagemaker_mp_enabled(): import smdistributed.modelparallel.torch as smp diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index 353fe45e8e..6e98c57166 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -153,6 +153,7 @@ from .import_utils import ( is_torch_cuda_available, is_torch_fx_available, is_torch_fx_proxy, + is_torch_neuroncore_available, is_torch_onnx_dict_inputs_support_available, is_torch_tensorrt_fx_available, is_torch_tf32_available, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 80ffd38c10..d1457b6709 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -451,6 +451,13 @@ def is_torch_tpu_available(check_device=True): return False +@lru_cache() +def is_torch_neuroncore_available(check_device=True): + if importlib.util.find_spec("torch_neuronx") is not None: + return is_torch_tpu_available(check_device) + return False + + def is_torchdynamo_available(): if not is_torch_available(): return False diff --git a/tests/trainer/test_trainer_distributed.py b/tests/trainer/test_trainer_distributed.py index 6ed74efe51..68d07e0f60 100644 --- a/tests/trainer/test_trainer_distributed.py +++ b/tests/trainer/test_trainer_distributed.py @@ -21,6 +21,7 @@ from transformers.testing_utils import ( execute_subprocess_async, get_torch_dist_unique_port, require_torch_multi_gpu, + require_torch_neuroncore, ) from transformers.utils import logging @@ -62,6 +63,23 @@ if is_torch_available(): return input_ids +class TestTrainerDistributedNeuronCore(TestCasePlus): + @require_torch_neuroncore + def test_trainer(self): + + distributed_args = f""" + -m torch.distributed.launch + --nproc_per_node=2 + --master_port={get_torch_dist_unique_port()} + {self.test_file_dir}/test_trainer_distributed.py + """.split() + output_dir = self.get_auto_remove_tmp_dir() + args = f"--output_dir {output_dir}".split() + cmd = [sys.executable] + distributed_args + args + execute_subprocess_async(cmd, env=self.get_env()) + # successful return here == success - any errors would have caused an error in the sub-call + + class TestTrainerDistributed(TestCasePlus): @require_torch_multi_gpu def test_trainer(self):