[tests] switch to torchrun (#22712)
This commit is contained in:
@@ -366,7 +366,7 @@ class TestTrainerExt(TestCasePlus):
|
|||||||
n_gpus_to_use = get_gpu_count()
|
n_gpus_to_use = get_gpu_count()
|
||||||
master_port = get_torch_dist_unique_port()
|
master_port = get_torch_dist_unique_port()
|
||||||
distributed_args = f"""
|
distributed_args = f"""
|
||||||
-m torch.distributed.launch
|
-m torch.distributed.run
|
||||||
--nproc_per_node={n_gpus_to_use}
|
--nproc_per_node={n_gpus_to_use}
|
||||||
--master_port={master_port}
|
--master_port={master_port}
|
||||||
{self.examples_dir_str}/pytorch/translation/run_translation.py
|
{self.examples_dir_str}/pytorch/translation/run_translation.py
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ class TestTrainerDistributedNeuronCore(TestCasePlus):
|
|||||||
@require_torch_neuroncore
|
@require_torch_neuroncore
|
||||||
def test_trainer(self):
|
def test_trainer(self):
|
||||||
distributed_args = f"""
|
distributed_args = f"""
|
||||||
-m torch.distributed.launch
|
-m torch.distributed.run
|
||||||
--nproc_per_node=2
|
--nproc_per_node=2
|
||||||
--master_port={get_torch_dist_unique_port()}
|
--master_port={get_torch_dist_unique_port()}
|
||||||
{self.test_file_dir}/test_trainer_distributed.py
|
{self.test_file_dir}/test_trainer_distributed.py
|
||||||
@@ -83,7 +83,7 @@ class TestTrainerDistributed(TestCasePlus):
|
|||||||
@require_torch_multi_gpu
|
@require_torch_multi_gpu
|
||||||
def test_trainer(self):
|
def test_trainer(self):
|
||||||
distributed_args = f"""
|
distributed_args = f"""
|
||||||
-m torch.distributed.launch
|
-m torch.distributed.run
|
||||||
--nproc_per_node={torch.cuda.device_count()}
|
--nproc_per_node={torch.cuda.device_count()}
|
||||||
--master_port={get_torch_dist_unique_port()}
|
--master_port={get_torch_dist_unique_port()}
|
||||||
{self.test_file_dir}/test_trainer_distributed.py
|
{self.test_file_dir}/test_trainer_distributed.py
|
||||||
@@ -98,7 +98,7 @@ class TestTrainerDistributed(TestCasePlus):
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# The script below is meant to be run under torch.distributed, on a machine with multiple GPUs:
|
# The script below is meant to be run under torch.distributed, on a machine with multiple GPUs:
|
||||||
#
|
#
|
||||||
# PYTHONPATH="src" python -m torch.distributed.launch --nproc_per_node 2 --output_dir output_dir ./tests/test_trainer_distributed.py
|
# PYTHONPATH="src" python -m torch.distributed.run --nproc_per_node 2 ./tests/test_trainer_distributed.py --output_dir output_dir
|
||||||
|
|
||||||
parser = HfArgumentParser((TrainingArguments,))
|
parser = HfArgumentParser((TrainingArguments,))
|
||||||
training_args = parser.parse_args_into_dataclasses()[0]
|
training_args = parser.parse_args_into_dataclasses()[0]
|
||||||
|
|||||||
Reference in New Issue
Block a user