From b1a8aa94f0a2ccea7c68b79066141aa822b96e42 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 9 Jun 2021 09:23:47 -0700 Subject: [PATCH] [test] support more than 2 gpus (#12074) * support more than 2 gpus * style --- tests/test_trainer.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 89a68792c8..3610f98d81 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -34,6 +34,7 @@ from transformers.testing_utils import ( PASS, USER, TestCasePlus, + get_gpu_count, get_tests_dir, is_staging_test, require_datasets, @@ -1113,15 +1114,17 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): # this is a sensitive test so let's keep debugging printouts in place for quick diagnosis. # it's using pretty large safety margins, but small enough to detect broken functionality. debug = 0 + n_gpus = get_gpu_count() bs = 8 + eval_len = 16 * n_gpus # make the params somewhat big so that there will be enough RAM consumed to be able to # measure things. We should get about 64KB for a+b in fp32 a = torch.ones(1000, bs) + 0.001 b = torch.ones(1000, bs) - 0.001 # 1. with mem metrics enabled - trainer = get_regression_trainer(a=a, b=b, eval_len=16, skip_memory_metrics=False) + trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, skip_memory_metrics=False) metrics = trainer.evaluate() del trainer gc.collect() @@ -1142,7 +1145,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): self.assertLess(fp32_eval, 5_000) # 2. with mem metrics disabled - trainer = get_regression_trainer(a=a, b=b, eval_len=16, fp16_full_eval=True, skip_memory_metrics=False) + trainer = get_regression_trainer(a=a, b=b, eval_len=eval_len, fp16_full_eval=True, skip_memory_metrics=False) metrics = trainer.evaluate() fp16_init = metrics["init_mem_gpu_alloc_delta"] fp16_eval = metrics["eval_mem_gpu_alloc_delta"]