solve the port conflict (#14362)
This commit is contained in:
@@ -64,6 +64,28 @@ def load_json(path):
|
|||||||
return json.load(f)
|
return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
def get_master_port(real_launcher=False):
|
||||||
|
"""
|
||||||
|
When using a single gpu launcher emulation (i.e. not deepspeed or python -m torch.distributed)
|
||||||
|
the issue is that once the port is tied it can't be used anywhere else outside of this process,
|
||||||
|
since torch.dist doesn't free the port until the process exits. Therefore for the sake of being
|
||||||
|
able to run both emulated launcher and normal launcher tests we need 2 distinct ports.
|
||||||
|
|
||||||
|
This function will give the right port in the right context. For real launcher it'll give the
|
||||||
|
base port, for emulated launcher it'll give the base port + 1. In both cases a string is
|
||||||
|
returned.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
`real_launcher`: whether a real launcher is going to be used, or the emulated one
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
master_port_base = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
|
||||||
|
if not real_launcher:
|
||||||
|
master_port_base = str(int(master_port_base) + 1)
|
||||||
|
return master_port_base
|
||||||
|
|
||||||
|
|
||||||
def require_deepspeed_aio(test_case):
|
def require_deepspeed_aio(test_case):
|
||||||
"""
|
"""
|
||||||
Decorator marking a test that requires deepspeed aio (nvme)
|
Decorator marking a test that requires deepspeed aio (nvme)
|
||||||
@@ -92,7 +114,7 @@ def get_launcher(distributed=False):
|
|||||||
# 2. for now testing with just 2 gpus max (since some quality tests may give different
|
# 2. for now testing with just 2 gpus max (since some quality tests may give different
|
||||||
# results with mode gpus because we use very little data)
|
# results with mode gpus because we use very little data)
|
||||||
num_gpus = min(2, get_gpu_count()) if distributed else 1
|
num_gpus = min(2, get_gpu_count()) if distributed else 1
|
||||||
master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
|
master_port = get_master_port(real_launcher=True)
|
||||||
return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
|
return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
|
||||||
|
|
||||||
|
|
||||||
@@ -111,7 +133,7 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
def setUp(self):
|
def setUp(self):
|
||||||
super().setUp()
|
super().setUp()
|
||||||
|
|
||||||
master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
|
master_port = get_master_port(real_launcher=False)
|
||||||
self.dist_env_1_gpu = dict(
|
self.dist_env_1_gpu = dict(
|
||||||
MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
|
MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
|
||||||
)
|
)
|
||||||
@@ -181,7 +203,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
self.n_epochs = args.num_train_epochs
|
self.n_epochs = args.num_train_epochs
|
||||||
self.batch_size = args.train_batch_size
|
self.batch_size = args.train_batch_size
|
||||||
|
|
||||||
master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
|
master_port = get_master_port(real_launcher=False)
|
||||||
self.dist_env_1_gpu = dict(
|
self.dist_env_1_gpu = dict(
|
||||||
MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
|
MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user