From 0118ef89eeffc58d8e3d6e26ca955e643f89ec2f Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Mon, 19 Jul 2021 19:50:47 +0200
Subject: [PATCH] Enforce eval and save strategies are compatible when
 --load_best_model_at_end (#12786)

* Enforce eval and save strategies are compatible when --load_best_model_at_end

* Update doc

* Fix typos

* Fix tests
---
 src/transformers/trainer_callback.py |  5 +----
 src/transformers/training_args.py    | 21 +++++++++++++++++----
 tests/test_trainer.py                | 16 +++++++++++++---
 3 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py
index 278fb6d0ab..0ccbda280c 100644
--- a/src/transformers/trainer_callback.py
+++ b/src/transformers/trainer_callback.py
@@ -418,8 +418,7 @@ class DefaultFlowCallback(TrainerCallback):
 
         # Save
         if (
-            not args.load_best_model_at_end
-            and args.save_strategy == IntervalStrategy.STEPS
+            args.save_strategy == IntervalStrategy.STEPS
             and args.save_steps > 0
             and state.global_step % args.save_steps == 0
         ):
@@ -439,8 +438,6 @@ class DefaultFlowCallback(TrainerCallback):
         # Evaluate
         if args.evaluation_strategy == IntervalStrategy.EPOCH:
             control.should_evaluate = True
-            if args.load_best_model_at_end:
-                control.should_save = True
 
         # Save
         if args.save_strategy == IntervalStrategy.EPOCH:
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index dcb3aa6d0b..5866380f46 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -172,8 +172,7 @@ class TrainingArguments:
         logging_steps (:obj:`int`, `optional`, defaults to 500):
             Number of update steps between two logs if :obj:`logging_strategy="steps"`.
         save_strategy (:obj:`str` or :class:`~transformers.trainer_utils.IntervalStrategy`, `optional`, defaults to :obj:`"steps"`):
-            The checkpoint save strategy to adopt during training (Note that when :obj:`load_best_model_at_end=True`,
-            this parameter is ignored and the model is saved after each evaluation). Possible values are:
+            The checkpoint save strategy to adopt during training. Possible values are:
 
                 * :obj:`"no"`: No save is done during training.
                 * :obj:`"epoch"`: Save is done at the end of each epoch.
@@ -247,8 +246,9 @@ class TrainingArguments:
 
             .. note::
 
-                When set to :obj:`True`, the parameters :obj:`save_strategy` and :obj:`save_steps` will be ignored and
-                the model will be saved after each evaluation.
+                When set to :obj:`True`, the parameters :obj:`save_strategy` needs to be the same as
+                :obj:`eval_strategy`, and in the case it is "steps", :obj:`save_steps` must be a round multiple of
+                :obj:`eval_steps`.
         metric_for_best_model (:obj:`str`, `optional`):
             Use in conjunction with :obj:`load_best_model_at_end` to specify the metric to use to compare two different
             models. Must be the name of a metric returned by the evaluation with or without the prefix :obj:`"eval_"`.
@@ -666,6 +666,19 @@ class TrainingArguments:
         if self.eval_steps is None:
             self.eval_steps = self.logging_steps
 
+        # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible.
+        if self.load_best_model_at_end:
+            if self.evaluation_strategy != self.save_strategy:
+                raise ValueError(
+                    "--load_best_model_at_end requires the save and eval strategy to match, but found\n- Evaluation "
+                    f"strategy: {self.evaluation_strategy}\n- Save strategy: {self.save_strategy}"
+                )
+            if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0:
+                raise ValueError(
+                    "--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation "
+                    f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}."
+                )
+
         if self.load_best_model_at_end and self.metric_for_best_model is None:
             self.metric_for_best_model = "loss"
         if self.greater_is_better is None and self.metric_for_best_model is not None:
diff --git a/tests/test_trainer.py b/tests/test_trainer.py
index 3d12f82327..c94b61751c 100644
--- a/tests/test_trainer.py
+++ b/tests/test_trainer.py
@@ -915,6 +915,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
                 learning_rate=0.1,
                 eval_steps=5,
                 evaluation_strategy="steps",
+                save_steps=5,
                 load_best_model_at_end=True,
             )
             self.assertFalse(trainer.args.greater_is_better)
@@ -930,6 +931,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
                 learning_rate=0.1,
                 eval_steps=5,
                 evaluation_strategy="steps",
+                save_steps=5,
                 load_best_model_at_end=True,
                 metric_for_best_model="accuracy",
                 compute_metrics=AlmostAccuracy(),
@@ -939,7 +941,6 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
             self.check_saved_checkpoints(tmpdir, 5, total)
             self.check_best_model_has_been_loaded(tmpdir, 5, total, trainer, "eval_accuracy", greater_is_better=True)
 
-        # Save is done every eval regardless of the strategy
         with tempfile.TemporaryDirectory() as tmpdir:
             trainer = get_regression_trainer(
                 a=1.5,
@@ -947,6 +948,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
                 output_dir=tmpdir,
                 learning_rate=0.1,
                 evaluation_strategy="epoch",
+                save_strategy="epoch",
                 load_best_model_at_end=True,
                 metric_for_best_model="accuracy",
                 compute_metrics=AlmostAccuracy(),
@@ -965,6 +967,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
                 learning_rate=0.1,
                 eval_steps=5,
                 evaluation_strategy="steps",
+                save_steps=5,
                 load_best_model_at_end=True,
                 pretrained=False,
             )
@@ -1083,6 +1086,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
                 per_device_train_batch_size=16,
                 load_best_model_at_end=True,
                 evaluation_strategy=IntervalStrategy.EPOCH,
+                save_strategy=IntervalStrategy.EPOCH,
                 compute_metrics=AlmostAccuracy(),
                 metric_for_best_model="accuracy",
             )
@@ -1140,13 +1144,17 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
             self.check_checkpoint_deletion(trainer, tmp_dir, [20, 25])
 
             # With best model at end
-            trainer = get_regression_trainer(output_dir=tmp_dir, load_best_model_at_end=True, save_total_limit=2)
+            trainer = get_regression_trainer(
+                output_dir=tmp_dir, evaluation_strategy="steps", load_best_model_at_end=True, save_total_limit=2
+            )
             trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-5")
             self.check_checkpoint_deletion(trainer, tmp_dir, [5, 25])
 
             # Edge case: we don't always honor save_total_limit=1 if load_best_model_at_end=True to be able to resume
             # from checkpoint
-            trainer = get_regression_trainer(output_dir=tmp_dir, load_best_model_at_end=True, save_total_limit=1)
+            trainer = get_regression_trainer(
+                output_dir=tmp_dir, evaluation_strategy="steps", load_best_model_at_end=True, save_total_limit=1
+            )
             trainer.state.best_model_checkpoint = os.path.join(tmp_dir, "checkpoint-25")
             self.check_checkpoint_deletion(trainer, tmp_dir, [25])
 
@@ -1350,6 +1358,7 @@ class TrainerHyperParameterOptunaIntegrationTest(unittest.TestCase):
                 learning_rate=0.1,
                 logging_steps=1,
                 evaluation_strategy=IntervalStrategy.EPOCH,
+                save_strategy=IntervalStrategy.EPOCH,
                 num_train_epochs=4,
                 disable_tqdm=True,
                 load_best_model_at_end=True,
@@ -1400,6 +1409,7 @@ class TrainerHyperParameterRayIntegrationTest(unittest.TestCase):
                 learning_rate=0.1,
                 logging_steps=1,
                 evaluation_strategy=IntervalStrategy.EPOCH,
+                save_strategy=IntervalStrategy.EPOCH,
                 num_train_epochs=4,
                 disable_tqdm=True,
                 load_best_model_at_end=True,