From 1c76a51615ccd5c8e60570771b29ef16a9c3bc17 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 10 Nov 2021 19:11:45 -0800 Subject: [PATCH] solve the port conflict (#14362) --- tests/deepspeed/test_deepspeed.py | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 70eb95e9e5..5c06d8b57f 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -64,6 +64,28 @@ def load_json(path): return json.load(f) +def get_master_port(real_launcher=False): + """ + When using a single gpu launcher emulation (i.e. not deepspeed or python -m torch.distributed) + the issue is that once the port is tied it can't be used anywhere else outside of this process, + since torch.dist doesn't free the port until the process exits. Therefore for the sake of being + able to run both emulated launcher and normal launcher tests we need 2 distinct ports. + + This function will give the right port in the right context. For real launcher it'll give the + base port, for emulated launcher it'll give the base port + 1. In both cases a string is + returned. + + Args: + `real_launcher`: whether a real launcher is going to be used, or the emulated one + + """ + + master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT) + if not real_launcher: + master_port_base = str(int(master_port_base) + 1) + return master_port_base + + def require_deepspeed_aio(test_case): """ Decorator marking a test that requires deepspeed aio (nvme) @@ -92,7 +114,7 @@ def get_launcher(distributed=False): # 2. for now testing with just 2 gpus max (since some quality tests may give different # results with mode gpus because we use very little data) num_gpus = min(2, get_gpu_count()) if distributed else 1 - master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT) + master_port = get_master_port(real_launcher=True) return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split() @@ -111,7 +133,7 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): def setUp(self): super().setUp() - master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT) + master_port = get_master_port(real_launcher=False) self.dist_env_1_gpu = dict( MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1" ) @@ -181,7 +203,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): self.n_epochs = args.num_train_epochs self.batch_size = args.train_batch_size - master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT) + master_port = get_master_port(real_launcher=False) self.dist_env_1_gpu = dict( MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1" )