[testing] ensure concurrent pytest workers use a unique port for torch.dist (#12166)

* ensure concurrent pytest workers use a unique port for torch.distributed.launch

* reword
This commit is contained in:
Stas Bekman
2021-06-15 11:12:59 -07:00
committed by GitHub
parent b9d66f4c4b
commit 6e7cc5cc51
3 changed files with 32 additions and 1 deletion

View File

@@ -25,6 +25,7 @@ from transformers.testing_utils import (
TestCasePlus,
execute_subprocess_async,
get_gpu_count,
get_torch_dist_unique_port,
require_torch_gpu,
require_torch_multi_gpu,
require_torch_non_multi_gpu,
@@ -223,9 +224,11 @@ class TestTrainerExt(TestCasePlus):
if distributed:
n_gpu = get_gpu_count()
master_port = get_torch_dist_unique_port()
distributed_args = f"""
-m torch.distributed.launch
--nproc_per_node={n_gpu}
--master_port={master_port}
{self.examples_dir_str}/pytorch/translation/run_translation.py
""".split()
cmd = [sys.executable] + distributed_args + args