[Trainer] memory tracker metrics (#10225)

* memory tracker metrics

* go back to eval for somewhat consistency

* handle no-gpu case

* deal with stackable eval calls

* restore callback order

* style

* simplify the API

* add test

* docs

* consistently use eval_ prefix

* improve docs

* Update src/transformers/trainer_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* rename method

* style

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
Stas Bekman
2021-02-18 09:27:32 -08:00
committed by GitHub
parent d7f38c5d1d
commit 97e688bc22
7 changed files with 294 additions and 14 deletions

View File

@@ -884,6 +884,34 @@ class TrainerIntegrationTest(unittest.TestCase):
trainer.train()
self.assertTrue(isinstance(trainer.state.total_flos, float))
def check_mem_metrics(self, trainer, check_func):
metrics = trainer.train().metrics
check_func("init_mem_cpu_alloc_delta", metrics)
check_func("train_mem_cpu_alloc_delta", metrics)
if torch.cuda.device_count() > 0:
check_func("init_mem_gpu_alloc_delta", metrics)
check_func("train_mem_gpu_alloc_delta", metrics)
metrics = trainer.evaluate()
check_func("eval_mem_cpu_alloc_delta", metrics)
if torch.cuda.device_count() > 0:
check_func("eval_mem_gpu_alloc_delta", metrics)
metrics = trainer.predict(RegressionDataset()).metrics
check_func("test_mem_cpu_alloc_delta", metrics)
if torch.cuda.device_count() > 0:
check_func("test_mem_gpu_alloc_delta", metrics)
def test_mem_metrics(self):
# with mem metrics enabled
trainer = get_regression_trainer()
self.check_mem_metrics(trainer, self.assertIn)
# with mem metrics disabled
trainer = get_regression_trainer(skip_memory_metrics=True)
self.check_mem_metrics(trainer, self.assertNotIn)
@require_torch
@require_optuna