Add AWS Neuron torchrun support (#20806)

* Add XLA torchrun support
* Clarify that currently DDP doesn't work with torch.distributed XLA backend yet
* Enable DDP with torchrun and XLA (now available in PT-XLA 1.13)
* Add check for AWS Neuron availability and AWS Neuron specific compiler flag
* Change the new test's name to TestTrainerDistributedNeuronCore
* Remove "assert" and replace raised exception
* Remove compiler flag as it is optional. If needed, will be another PR.
* Use TORCHELASTIC_RUN_ID to determine whether torchrun is used
2023-01-18 08:21:19 -08:00
parent f70ee51029
commit c59d71b282
6 changed files with 50 additions and 0 deletions
--- a/tests/trainer/test_trainer_distributed.py
+++ b/tests/trainer/test_trainer_distributed.py
@@ -21,6 +21,7 @@ from transformers.testing_utils import (
    execute_subprocess_async,
    get_torch_dist_unique_port,
    require_torch_multi_gpu,
+    require_torch_neuroncore,
 )
 from transformers.utils import logging

@@ -62,6 +63,23 @@ if is_torch_available():
                return input_ids


+class TestTrainerDistributedNeuronCore(TestCasePlus):
+    @require_torch_neuroncore
+    def test_trainer(self):
+        """Launch test_trainer_distributed.py with 2 processes via torch.distributed.launch."""
+        distributed_args = f"""
+            -m torch.distributed.launch
+            --nproc_per_node=2
+            --master_port={get_torch_dist_unique_port()}
+            {self.test_file_dir}/test_trainer_distributed.py
+        """.split()
+        output_dir = self.get_auto_remove_tmp_dir()
+        args = f"--output_dir {output_dir}".split()
+        cmd = [sys.executable] + distributed_args + args
+        execute_subprocess_async(cmd, env=self.get_env())
+        # successful return here == success - any errors would have caused an error in the sub-call
+
+
 class TestTrainerDistributed(TestCasePlus):
    @require_torch_multi_gpu
    def test_trainer(self):