From 3318c246f3c9f7f513823e72318fe356790182b9 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Wed, 17 Mar 2021 11:16:37 -0700 Subject: [PATCH] make failure to find a resume checkpoint fatal + tests (#10777) --- src/transformers/trainer.py | 5 ++++- tests/test_trainer.py | 36 ++++++++++++++++++++++++------------ 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 4b4edd279e..bf1a5e1731 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -876,7 +876,10 @@ class Trainer: if resume_from_checkpoint is None: raise ValueError(f"No valid checkpoint found in output directory ({self.args.output_dir})") - if resume_from_checkpoint is not None and os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): + if resume_from_checkpoint is not None: + if not os.path.isfile(os.path.join(resume_from_checkpoint, WEIGHTS_NAME)): + raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") + logger.info(f"Loading model from {resume_from_checkpoint}).") if self.deepspeed: diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 4a36118c4e..ed1deaa8c2 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -613,7 +613,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): return with tempfile.TemporaryDirectory() as tmpdir: - trainer = get_regression_trainer(output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1) + kwargs = dict(output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1) + trainer = get_regression_trainer(**kwargs) trainer.train() (a, b) = trainer.model.a.item(), trainer.model.b.item() state = dataclasses.asdict(trainer.state) @@ -621,7 +622,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): checkpoint = os.path.join(tmpdir, "checkpoint-5") # Reinitialize trainer - trainer = get_regression_trainer(output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1) 
+ trainer = get_regression_trainer(**kwargs) trainer.train(resume_from_checkpoint=checkpoint) (a1, b1) = trainer.model.a.item(), trainer.model.b.item() @@ -634,7 +635,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): checkpoint = os.path.join(tmpdir, "checkpoint-15") # Reinitialize trainer and load model - trainer = get_regression_trainer(output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1) + trainer = get_regression_trainer(**kwargs) trainer.train(resume_from_checkpoint=checkpoint) (a1, b1) = trainer.model.a.item(), trainer.model.b.item() @@ -645,9 +646,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): # With a regular model that is not a PreTrainedModel with tempfile.TemporaryDirectory() as tmpdir: - trainer = get_regression_trainer( - output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1, pretrained=False - ) + kwargs = dict(output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1, pretrained=False) + + trainer = get_regression_trainer(**kwargs) trainer.train() (a, b) = trainer.model.a.item(), trainer.model.b.item() state = dataclasses.asdict(trainer.state) @@ -655,9 +656,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): checkpoint = os.path.join(tmpdir, "checkpoint-5") # Reinitialize trainer and load model - trainer = get_regression_trainer( - output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1, pretrained=False - ) + trainer = get_regression_trainer(**kwargs) trainer.train(resume_from_checkpoint=checkpoint) (a1, b1) = trainer.model.a.item(), trainer.model.b.item() @@ -670,9 +669,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): checkpoint = os.path.join(tmpdir, "checkpoint-15") # Reinitialize trainer and load model - trainer = get_regression_trainer( - output_dir=tmpdir, train_len=128, save_steps=5, learning_rate=0.1, pretrained=False - ) + trainer = get_regression_trainer(**kwargs) 
trainer.train(resume_from_checkpoint=checkpoint) (a1, b1) = trainer.model.a.item(), trainer.model.b.item() @@ -681,6 +678,21 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): self.assertEqual(b, b1) self.check_trainer_state_are_the_same(state, state1) + # Now check failures + + # 1. fail to find a bogus checkpoint + trainer = get_regression_trainer() + with self.assertRaises(Exception) as context: + trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus") + self.assertTrue("Can't find a valid checkpoint at" in str(context.exception)) + + # 2. fail to find any checkpoint - due to a fresh output_dir + output_dir2 = self.get_auto_remove_tmp_dir() + trainer = get_regression_trainer(output_dir=output_dir2) + with self.assertRaises(Exception) as context: + trainer.train(resume_from_checkpoint=True) + self.assertTrue("No valid checkpoint found in output directory" in str(context.exception)) + def test_resume_training_with_gradient_accumulation(self): if torch.cuda.device_count() > 2: # This test will fail for more than 2 GPUs since the batch size will get bigger and with the number of