Add AWS Neuron torchrun support (#20806)

* Add XLA torchrun support

* Clarify that currently DDP doesn't work with torch.distributed XLA backend yet

* Enable DDP with torchrun and XLA (now available in PT-XLA 1.13)

* Add check for AWS Neuron availability and AWS Neuron specific compiler flag

* Change the new test's name to TestTrainerDistributedNeuronCore

* Remove "assert" and replace raised exception

* Remove compiler flag as it is optional. If needed, will be another PR.

* Use TORCHELASTIC_RUN_ID to determine whether torchrun is used
This commit is contained in:
jeffhataws
2023-01-18 08:21:19 -08:00
committed by GitHub
parent f70ee51029
commit c59d71b282
6 changed files with 50 additions and 0 deletions

View File

@@ -21,6 +21,7 @@ from transformers.testing_utils import (
execute_subprocess_async,
get_torch_dist_unique_port,
require_torch_multi_gpu,
require_torch_neuroncore,
)
from transformers.utils import logging
@@ -62,6 +63,23 @@ if is_torch_available():
return input_ids
class TestTrainerDistributedNeuronCore(TestCasePlus):
@require_torch_neuroncore
def test_trainer(self):
distributed_args = f"""
-m torch.distributed.launch
--nproc_per_node=2
--master_port={get_torch_dist_unique_port()}
{self.test_file_dir}/test_trainer_distributed.py
""".split()
output_dir = self.get_auto_remove_tmp_dir()
args = f"--output_dir {output_dir}".split()
cmd = [sys.executable] + distributed_args + args
execute_subprocess_async(cmd, env=self.get_env())
# successful return here == success - any errors would have caused an error in the sub-call
class TestTrainerDistributed(TestCasePlus):
@require_torch_multi_gpu
def test_trainer(self):