Yell at the user if zero-3 init wasn't performed, but expected to have been done (#32299)

* Test this zach
* Test for improper init w/o zero3
* Move back
* Apply suggestions from code review
  Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
* Get rid of stars in warning
* Make private
* Make clear
---------
Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
2024-08-01 15:18:43 -04:00
parent 51ab25e293
commit 82efc53513
3 changed files with 42 additions and 0 deletions
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -709,6 +709,31 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
        # Relative difference. See the note above how to get identical loss on a small bs
        self.assertTrue((no_grad_accum_loss - yes_grad_accum_loss) / (no_grad_accum_loss + 1e-15) <= 1e-3)

+    def test_missed_zero3_init(self):
+        from transformers import Trainer  # noqa
+
+        with mockenv_context(**self.dist_env_1_gpu):
+            model = AutoModel.from_pretrained(T5_TINY)
+            training_args = TrainingArguments(
+                output_dir="./test_missed_zero3_init",
+                deepspeed=self.get_config_dict(ZERO3),
+            )
+            with self.assertRaises(
+                ValueError, msg="Model was not initialized with `Zero-3` despite being configured."
+            ):
+                _ = Trainer(
+                    model=model,
+                    args=training_args,
+                )
+            # Now do it properly, triggered from our `TrainingArguments` earlier
+            model = AutoModel.from_pretrained(T5_TINY)
+            trainer = Trainer(
+                model=model,
+                args=training_args,
+            )
+            assert trainer.is_deepspeed_enabled
+            assert model._transformers_zero3_init_used
+
    def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype):
        # adapted from TrainerIntegrationCommon.check_saved_checkpoints
        file_list = [SAFE_WEIGHTS_NAME, "training_args.bin", "trainer_state.json", "config.json"]