solve the port conflict (#14362)

2021-11-10 19:11:45 -08:00
parent 9e37c5cdf8
commit 1c76a51615
1 changed files with 25 additions and 3 deletions
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -64,6 +64,28 @@ def load_json(path):
        return json.load(f)
 def get_master_port(real_launcher=False):
    """
    Pick the master port to use for the current launcher flavour.

    The emulated single-gpu launcher (i.e. neither `deepspeed` nor `python -m torch.distributed`)
    keeps its port bound until the test process exits, so that port cannot be reused by anything
    launched outside of this process. To let emulated-launcher and real-launcher tests coexist in
    the same run, each flavour is given its own port:

    - real launcher: the base port, i.e. the `DS_TEST_PORT` environment variable when set,
      otherwise `DEFAULT_MASTER_PORT`, returned unchanged
    - emulated launcher: the base port + 1, converted to a string

    Environment values are always strings; `DEFAULT_MASTER_PORT` is presumably a string as well
    (it is defined elsewhere in this file), so callers can expect a string in both cases.

    Args:
        `real_launcher`: whether a real launcher is going to be used, or the emulated one
    """
    base_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
    if real_launcher:
        # the real launcher owns the base port as-is
        return base_port
    # the emulated launcher is shifted by one to stay clear of the real launcher's port
    return str(int(base_port) + 1)
 def require_deepspeed_aio(test_case):
    """
    Decorator marking a test that requires deepspeed aio (nvme)
@@ -92,7 +114,7 @@ def get_launcher(distributed=False):
    # 2. for now testing with just 2 gpus max (since some quality tests may give different
    # results with more gpus because we use very little data)
    num_gpus = min(2, get_gpu_count()) if distributed else 1
-    master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+    master_port = get_master_port(real_launcher=True)
    return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
@@ -111,7 +133,7 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
    def setUp(self):
        super().setUp()
-        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+        master_port = get_master_port(real_launcher=False)
        self.dist_env_1_gpu = dict(
            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
        )
@@ -181,7 +203,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
        self.n_epochs = args.num_train_epochs
        self.batch_size = args.train_batch_size
-        master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
+        master_port = get_master_port(real_launcher=False)
        self.dist_env_1_gpu = dict(
            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
        )