Trainer - add cache clearing and the option for batched eval metrics computation (#28769)

* Added cache clearing for GPU efficiency.
* Added cache clearing for GPU efficiency.
* Added batch_eval_metrics capability
* Ran make fixup
* Fixed bug
* Fixed whitespace issue
* Fixed outdated condition
* Updated docstrings with instructions for batch_eval_metrics. Updated end of dataloader logic
* Added first version of batch_eval_metrics Trainer test
* Fixed batch_eval_metrics Trainer tests for both eval and predict
* Fixed batch_eval_metrics behavior for new Trainer variables
* Fixed batch_eval_metrics Trainer tests
* Ran fixup
2024-05-06 05:23:40 -07:00
parent e076953079
commit df475bf8e6
3 changed files with 205 additions and 11 deletions
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -230,6 +230,27 @@ class AlmostAccuracy:
        return {"accuracy": true.astype(np.float32).mean().item()}


+class AlmostAccuracyBatched:  # Callable metric that accumulates a threshold-based accuracy one batch at a time.
+    def __init__(self, thresh=0.25):  # thresh: largest |prediction - label| still counted as correct
+        self.thresh = thresh
+        self.batch_acc = []  # running state: one accuracy entry per sample seen since the last reported result
+
+    def __call__(self, eval_pred, compute_result):  # returns the metrics dict only when compute_result is truthy, else None
+        predictions, labels = eval_pred
+        if isinstance(predictions, tuple):  # only the first element of a tuple of predictions is scored
+            predictions = predictions[0]
+        if isinstance(labels, tuple):  # likewise, only the first element of a tuple of labels is used
+            labels = labels[0]
+        batch_size = len(predictions)
+        true = torch.abs(predictions - labels) <= self.thresh  # elementwise mask of predictions within thresh of the label
+        acc = true.type(torch.FloatTensor).mean().item()  # fraction of this batch within thresh, as a Python float
+        self.batch_acc.extend([acc] * batch_size)  # repeat per sample so the final mean is weighted by batch size
+        if compute_result:
+            result = {"accuracy": np.mean(self.batch_acc).item()}
+            self.batch_acc = []  # reset so the same instance can be reused for another pass
+            return result
+
+
 class RegressionModelConfig(PretrainedConfig):
    def __init__(self, a=0, b=0, double_output=False, random_torch=True, **kwargs):
        super().__init__(**kwargs)
@@ -1524,6 +1545,49 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"]
        self.assertAlmostEqual(results["eval_accuracy"], expected_acc)

+    def test_evaluate_with_batch_eval_metrics(self):  # evaluate() with batch_eval_metrics=True must match loss/accuracy computed directly from the dataset
+        trainer = get_regression_trainer(
+            a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True
+        )
+        results = trainer.evaluate()
+
+        x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
+        pred = 1.5 * x + 2.5  # predictions the test expects for a=1.5, b=2.5
+        expected_loss = ((pred - y) ** 2).mean()  # mean squared error of the expected predictions
+        self.assertAlmostEqual(results["eval_loss"], expected_loss)
+        expected_acc = AlmostAccuracy()((pred, y))["accuracy"]  # reference value from AlmostAccuracy applied to the whole dataset at once
+        self.assertAlmostEqual(results["eval_accuracy"], expected_acc)
+
+        # With a number of elements not a round multiple of the batch size
+        trainer = get_regression_trainer(
+            a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True
+        )
+        results = trainer.evaluate()
+
+        x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
+        pred = 1.5 * x + 2.5
+        expected_loss = ((pred - y) ** 2).mean()
+        self.assertAlmostEqual(results["eval_loss"], expected_loss)
+        expected_acc = AlmostAccuracy()((pred, y))["accuracy"]
+        self.assertAlmostEqual(results["eval_accuracy"], expected_acc)
+
+        # With logits preprocess
+        trainer = get_regression_trainer(
+            a=1.5,
+            b=2.5,
+            compute_metrics=AlmostAccuracyBatched(),
+            batch_eval_metrics=True,
+            preprocess_logits_for_metrics=lambda logits, labels: logits + 1,
+        )
+        results = trainer.evaluate()
+
+        x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
+        pred = 1.5 * x + 2.5
+        expected_loss = ((pred - y) ** 2).mean()  # loss is still compared against the unshifted predictions
+        self.assertAlmostEqual(results["eval_loss"], expected_loss)
+        expected_acc = AlmostAccuracy()((pred + 1, y))["accuracy"]  # pred + 1 mirrors the `logits + 1` preprocess passed above
+        self.assertAlmostEqual(results["eval_accuracy"], expected_acc)
+
    def test_evaluate_with_jit(self):
        trainer = get_regression_trainer(a=1.5, b=2.5, compute_metrics=AlmostAccuracy(), jit_mode_eval=True)
        results = trainer.evaluate()
@@ -1651,6 +1715,58 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0]))
        self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1]))

+    def test_predict_with_batch_eval_metrics(self):  # predict() with batch_eval_metrics=True must return the expected predictions, labels and test_accuracy
+        trainer = get_regression_trainer(
+            a=1.5, b=2.5, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True
+        )
+        results = trainer.predict(trainer.eval_dataset)
+        preds = results.predictions
+        x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
+        gt = 1.5 * x + 2.5  # predictions the test expects for a=1.5, b=2.5
+        self.assertTrue(np.allclose(preds, gt))
+        expected_acc = AlmostAccuracy()((preds, y))["accuracy"]  # reference value from AlmostAccuracy applied to all predictions at once
+        self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc)
+
+        # With a number of elements not a round multiple of the batch size
+        trainer = get_regression_trainer(
+            a=1.5, b=2.5, eval_len=66, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True
+        )
+        results = trainer.predict(trainer.eval_dataset)
+        preds = results.predictions
+        x, y = trainer.eval_dataset.x, trainer.eval_dataset.ys[0]
+        self.assertTrue(np.allclose(preds, 1.5 * x + 2.5))
+        expected_acc = AlmostAccuracy()((preds, y))["accuracy"]
+        self.assertAlmostEqual(results.metrics["test_accuracy"], expected_acc)
+
+        # With more than one output of the model
+        trainer = get_regression_trainer(
+            a=1.5, b=2.5, double_output=True, compute_metrics=AlmostAccuracyBatched(), batch_eval_metrics=True
+        )
+        preds = trainer.predict(trainer.eval_dataset).predictions
+        x = trainer.eval_dataset.x
+        self.assertEqual(len(preds), 2)  # with double_output=True the predictions are expected to hold two outputs
+        self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5))
+        self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5))
+
+        # With more than one output/label of the model
+        trainer = get_regression_trainer(
+            a=1.5,
+            b=2.5,
+            double_output=True,
+            label_names=["labels", "labels_2"],
+            compute_metrics=AlmostAccuracyBatched(),
+            batch_eval_metrics=True,
+        )
+        outputs = trainer.predict(trainer.eval_dataset)
+        preds = outputs.predictions
+        labels = outputs.label_ids
+        x = trainer.eval_dataset.x
+        self.assertEqual(len(preds), 2)
+        self.assertTrue(np.allclose(preds[0], 1.5 * x + 2.5))
+        self.assertTrue(np.allclose(preds[1], 1.5 * x + 2.5))
+        self.assertTrue(np.array_equal(labels[0], trainer.eval_dataset.ys[0]))  # label_ids must match the dataset's first label column exactly
+        self.assertTrue(np.array_equal(labels[1], trainer.eval_dataset.ys[1]))  # and its second label column
+
    def test_predict_with_jit(self):
        trainer = get_regression_trainer(a=1.5, b=2.5, jit_mode_eval=True)
        preds = trainer.predict(trainer.eval_dataset).predictions