solve the port conflict (#14362)

This commit is contained in:
Stas Bekman
2021-11-10 19:11:45 -08:00
committed by GitHub
parent 9e37c5cdf8
commit 1c76a51615

View File

@@ -64,6 +64,28 @@ def load_json(path):
return json.load(f)
def get_master_port(real_launcher=False):
"""
When using a single gpu launcher emulation (i.e. not deepspeed or python -m torch.distributed)
the issue is that once the port is tied it can't be used anywhere else outside of this process,
since torch.dist doesn't free the port until the process exits. Therefore for the sake of being
able to run both emulated launcher and normal launcher tests we need 2 distinct ports.
This function will give the right port in the right context. For real launcher it'll give the
base port, for emulated launcher it'll give the base port + 1. In both cases a string is
returned.
Args:
`real_launcher`: whether a real launcher is going to be used, or the emulated one
"""
master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
if not real_launcher:
master_port_base = str(int(master_port_base) + 1)
return master_port_base
def require_deepspeed_aio(test_case):
"""
Decorator marking a test that requires deepspeed aio (nvme)
@@ -92,7 +114,7 @@ def get_launcher(distributed=False):
# 2. for now testing with just 2 gpus max (since some quality tests may give different
# results with mode gpus because we use very little data)
num_gpus = min(2, get_gpu_count()) if distributed else 1
master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
master_port = get_master_port(real_launcher=True)
return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
@@ -111,7 +133,7 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
def setUp(self):
super().setUp()
master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
master_port = get_master_port(real_launcher=False)
self.dist_env_1_gpu = dict(
MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
)
@@ -181,7 +203,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
self.n_epochs = args.num_train_epochs
self.batch_size = args.train_batch_size
master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
master_port = get_master_port(real_launcher=False)
self.dist_env_1_gpu = dict(
MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
)