[Trainer] memory tracker metrics (#10225)

* memory tracker metrics
* go back to eval for somewhat consistency
* handle no-gpu case
* deal with stackable eval calls
* restore callback order
* style
* simplify the API
* add test
* docs
* consistently use eval_ prefix
* improve docs
* Update src/transformers/trainer_utils.py
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* rename method
* style
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-02-18 09:27:32 -08:00
parent d7f38c5d1d
commit 97e688bc22
7 changed files with 294 additions and 14 deletions
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -884,6 +884,34 @@ class TrainerIntegrationTest(unittest.TestCase):
        trainer.train()
        self.assertTrue(isinstance(trainer.state.total_flos, float))

+    def check_mem_metrics(self, trainer, check_func):
+        """Pass each expected memory key of train/evaluate/predict metrics to `check_func(key, metrics)`."""
+
+        def check_keys(metrics, cpu_keys, gpu_keys):
+            # the gpu keys are only checked when at least one CUDA device is visible
+            expected = (cpu_keys + gpu_keys) if torch.cuda.device_count() > 0 else cpu_keys
+            for key in expected:
+                check_func(key, metrics)
+
+        check_keys(
+            trainer.train().metrics,
+            ["init_mem_cpu_alloc_delta", "train_mem_cpu_alloc_delta"],
+            ["init_mem_gpu_alloc_delta", "train_mem_gpu_alloc_delta"],
+        )
+        check_keys(trainer.evaluate(), ["eval_mem_cpu_alloc_delta"], ["eval_mem_gpu_alloc_delta"])
+        test_metrics = trainer.predict(RegressionDataset()).metrics
+        check_keys(test_metrics, ["test_mem_cpu_alloc_delta"], ["test_mem_gpu_alloc_delta"])
+
+    def test_mem_metrics(self):
+        # each scenario pairs trainer kwargs with the assertion applied to every memory key:
+        # default trainer -> keys must be present, skip_memory_metrics=True -> keys must be absent
+        scenarios = [({}, self.assertIn), ({"skip_memory_metrics": True}, self.assertNotIn)]
+
+        for trainer_kwargs, check_func in scenarios:
+            # a new trainer is built for every scenario
+            trainer = get_regression_trainer(**trainer_kwargs)
+            self.check_mem_metrics(trainer, check_func)
+

@require_torch
@require_optuna