From d0e96c6de64a496f99fb7675257530bd364655e4 Mon Sep 17 00:00:00 2001 From: Jeff Rasley Date: Mon, 8 Nov 2021 12:40:29 -0800 Subject: [PATCH] [deepspeed] Enable multiple test runs on single box, defer to DS_TEST_PORT if set (#14331) * defer to DS_TEST_PORT if set * style Co-authored-by: Stas Bekman --- tests/deepspeed/test_deepspeed.py | 12 +++++++++--- tests/deepspeed/test_model_zoo.py | 6 +++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index a7ba14b022..70eb95e9e5 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -51,6 +51,9 @@ with ExtendSysPath(tests_dir): set_seed(42) +# default torch.distributed port +DEFAULT_MASTER_PORT = "10999" + T5_SMALL = "t5-small" T5_TINY = "patrickvonplaten/t5-tiny-random" GPT2_TINY = "sshleifer/tiny-gpt2" @@ -89,7 +92,8 @@ def get_launcher(distributed=False): # 2. for now testing with just 2 gpus max (since some quality tests may give different # results with mode gpus because we use very little data) num_gpus = min(2, get_gpu_count()) if distributed else 1 - return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split() + master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT) + return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split() ZERO2 = "zero2" @@ -107,8 +111,9 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): def setUp(self): super().setUp() + master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT) self.dist_env_1_gpu = dict( - MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1" + MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1" ) def test_init_zero3(self): @@ -176,8 +181,9 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): self.n_epochs = args.num_train_epochs self.batch_size = args.train_batch_size + master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT) self.dist_env_1_gpu = dict( - MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1" + MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1" ) self.ds_config_file = dict( diff --git a/tests/deepspeed/test_model_zoo.py b/tests/deepspeed/test_model_zoo.py index 89548b7acc..321e8b2bf0 100644 --- a/tests/deepspeed/test_model_zoo.py +++ b/tests/deepspeed/test_model_zoo.py @@ -41,6 +41,9 @@ with ExtendSysPath(tests_dir): set_seed(42) +# default torch.distributed port +DEFAULT_MASTER_PORT = "10999" + # translation FSMT_TINY = "stas/tiny-wmt19-en-de" BART_TINY = "sshleifer/bart-tiny-random" @@ -89,7 +92,8 @@ def get_launcher(distributed=False): # 2. for now testing with just 2 gpus max (since some quality tests may give different # results with mode gpus because we use very little data) num_gpus = min(2, get_gpu_count()) if distributed else 1 - return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split() + master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT) + return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split() def make_task_cmds():