[test] support more than 2 gpus (#12074)
* support more than 2 gpus * style
This commit is contained in:
@@ -34,6 +34,7 @@ from transformers.testing_utils import (
|
|||||||
PASS,
|
PASS,
|
||||||
USER,
|
USER,
|
||||||
TestCasePlus,
|
TestCasePlus,
|
||||||
|
get_gpu_count,
|
||||||
get_tests_dir,
|
get_tests_dir,
|
||||||
is_staging_test,
|
is_staging_test,
|
||||||
require_datasets,
|
require_datasets,
|
||||||
@@ -1113,15 +1114,17 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
# this is a sensitive test so let's keep debugging printouts in place for quick diagnosis.
|
# this is a sensitive test so let's keep debugging printouts in place for quick diagnosis.
|
||||||
# it's using pretty large safety margins, but small enough to detect broken functionality.
|
# it's using pretty large safety margins, but small enough to detect broken functionality.
|
||||||
debug = 0
|
debug = 0
|
||||||
|
n_gpus = get_gpu_count()
|
||||||
|
|
||||||
bs = 8
|
bs = 8
|
||||||
|
eval_len = 16 * n_gpus
|
||||||
# make the params somewhat big so that there will be enough RAM consumed to be able to
|
# make the params somewhat big so that there will be enough RAM consumed to be able to
|
||||||
# measure things. We should get about 64KB for a+b in fp32
|
# measure things. We should get about 64KB for a+b in fp32
|
||||||
a = torch.ones(1000, bs) + 0.001
|
a = torch.ones(1000, bs) + 0.001
|
||||||
b = torch.ones(1000, bs) - 0.001
|
b = torch.ones(1000, bs) - 0.001
|
||||||
|
|
||||||
# 1. with mem metrics enabled
|
# 1. with mem metrics enabled
|
||||||
trainer = get_regression_trainer(a=a, b=b, eval_len=16, skip_memory_metrics=False)
|
trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, skip_memory_metrics=False)
|
||||||
metrics = trainer.evaluate()
|
metrics = trainer.evaluate()
|
||||||
del trainer
|
del trainer
|
||||||
gc.collect()
|
gc.collect()
|
||||||
@@ -1142,7 +1145,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
self.assertLess(fp32_eval, 5_000)
|
self.assertLess(fp32_eval, 5_000)
|
||||||
|
|
||||||
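# 2. with fp16_full_eval enabled (mem metrics are still enabled: skip_memory_metrics=False)
|
# 2. with fp16_full_eval enabled (mem metrics are still enabled: skip_memory_metrics=False)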
|
||||||
trainer = get_regression_trainer(a=a, b=b, eval_len=16, fp16_full_eval=True, skip_memory_metrics=False)
|
trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, fp16_full_eval=True, skip_memory_metrics=False)
|
||||||
metrics = trainer.evaluate()
|
metrics = trainer.evaluate()
|
||||||
fp16_init = metrics["init_mem_gpu_alloc_delta"]
|
fp16_init = metrics["init_mem_gpu_alloc_delta"]
|
||||||
fp16_eval = metrics["eval_mem_gpu_alloc_delta"]
|
fp16_eval = metrics["eval_mem_gpu_alloc_delta"]
|
||||||
|
|||||||
Reference in New Issue
Block a user