[deepspeed] Enable multiple test runs on single box, defer to DS_TEST_PORT if set (#14331)
* defer to DS_TEST_PORT if set
* style

Co-authored-by: Stas Bekman <stas@stason.org>
This commit is contained in:
@@ -51,6 +51,9 @@ with ExtendSysPath(tests_dir):
|
|||||||
|
|
||||||
set_seed(42)
|
set_seed(42)
|
||||||
|
|
||||||
|
# default torch.distributed port
|
||||||
|
DEFAULT_MASTER_PORT = "10999"
|
||||||
|
|
||||||
T5_SMALL = "t5-small"
|
T5_SMALL = "t5-small"
|
||||||
T5_TINY = "patrickvonplaten/t5-tiny-random"
|
T5_TINY = "patrickvonplaten/t5-tiny-random"
|
||||||
GPT2_TINY = "sshleifer/tiny-gpt2"
|
GPT2_TINY = "sshleifer/tiny-gpt2"
|
||||||
@@ -89,7 +92,8 @@ def get_launcher(distributed=False):
|
|||||||
# 2. for now testing with just 2 gpus max (since some quality tests may give different
|
# 2. for now testing with just 2 gpus max (since some quality tests may give different
|
||||||
# results with mode gpus because we use very little data)
|
# results with mode gpus because we use very little data)
|
||||||
num_gpus = min(2, get_gpu_count()) if distributed else 1
|
num_gpus = min(2, get_gpu_count()) if distributed else 1
|
||||||
return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
|
master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
|
||||||
|
return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
|
||||||
|
|
||||||
|
|
||||||
ZERO2 = "zero2"
|
ZERO2 = "zero2"
|
||||||
@@ -107,8 +111,9 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
def setUp(self):
|
def setUp(self):
|
||||||
super().setUp()
|
super().setUp()
|
||||||
|
|
||||||
|
master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
|
||||||
self.dist_env_1_gpu = dict(
|
self.dist_env_1_gpu = dict(
|
||||||
MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
|
MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_init_zero3(self):
|
def test_init_zero3(self):
|
||||||
@@ -176,8 +181,9 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
self.n_epochs = args.num_train_epochs
|
self.n_epochs = args.num_train_epochs
|
||||||
self.batch_size = args.train_batch_size
|
self.batch_size = args.train_batch_size
|
||||||
|
|
||||||
|
master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
|
||||||
self.dist_env_1_gpu = dict(
|
self.dist_env_1_gpu = dict(
|
||||||
MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
|
MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.ds_config_file = dict(
|
self.ds_config_file = dict(
|
||||||
|
|||||||
@@ -41,6 +41,9 @@ with ExtendSysPath(tests_dir):
|
|||||||
|
|
||||||
set_seed(42)
|
set_seed(42)
|
||||||
|
|
||||||
|
# default torch.distributed port
|
||||||
|
DEFAULT_MASTER_PORT = "10999"
|
||||||
|
|
||||||
# translation
|
# translation
|
||||||
FSMT_TINY = "stas/tiny-wmt19-en-de"
|
FSMT_TINY = "stas/tiny-wmt19-en-de"
|
||||||
BART_TINY = "sshleifer/bart-tiny-random"
|
BART_TINY = "sshleifer/bart-tiny-random"
|
||||||
@@ -89,7 +92,8 @@ def get_launcher(distributed=False):
|
|||||||
# 2. for now testing with just 2 gpus max (since some quality tests may give different
|
# 2. for now testing with just 2 gpus max (since some quality tests may give different
|
||||||
# results with mode gpus because we use very little data)
|
# results with mode gpus because we use very little data)
|
||||||
num_gpus = min(2, get_gpu_count()) if distributed else 1
|
num_gpus = min(2, get_gpu_count()) if distributed else 1
|
||||||
return f"deepspeed --num_nodes 1 --num_gpus {num_gpus}".split()
|
master_port = os.environ.get("DS_TEST_PORT", DEFAULT_MASTER_PORT)
|
||||||
|
return f"deepspeed --num_nodes 1 --num_gpus {num_gpus} --master_port {master_port}".split()
|
||||||
|
|
||||||
|
|
||||||
def make_task_cmds():
|
def make_task_cmds():
|
||||||
|
|||||||
Reference in New Issue
Block a user