[Deepspeed] ZeRO-Infinity integration plus config revamp (#11418)

* adding Z-inf
* revamp config process
* up version requirement
* wip
* massive rewrite
* cleanup
* cleanup
* Apply suggestions from code review
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
* consistent json commas
* act on suggestions
* leave this feature for 0.3.16
* style
  Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-04-26 10:40:32 -07:00
parent 0661abc545
commit bc2571e61c
10 changed files with 896 additions and 503 deletions
--- a/tests/deepspeed/ds_config_zero2.json
+++ b/tests/deepspeed/ds_config_zero2.json
@@ -1,6 +1,6 @@
 {
    "fp16": {
-        "enabled": true,
+        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
@@ -8,6 +8,25 @@
        "min_loss_scale": 1
    },

+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
@@ -19,25 +38,10 @@
        "cpu_offload": true
    },

-    "optimizer": {
-        "type": "AdamW",
-        "params": {
-            "lr": 3e-5,
-            "betas": [0.8, 0.999],
-            "eps": 1e-8,
-            "weight_decay": 3e-7
-        }
-    },
-
-    "scheduler": {
-        "type": "WarmupLR",
-        "params": {
-            "warmup_min_lr": 0,
-            "warmup_max_lr": 3e-5,
-            "warmup_num_steps": 500
-        }
-    },
-
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
 }
--- a/tests/deepspeed/ds_config_zero3.json
+++ b/tests/deepspeed/ds_config_zero3.json
@@ -1,6 +1,6 @@
 {
    "fp16": {
-        "enabled": true,
+        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
@@ -8,41 +8,50 @@
        "min_loss_scale": 1
    },

-    "zero_optimization": {
-        "stage": 3,
-        "cpu_offload": true,
-        "cpu_offload_params": true,
-        "cpu_offload_use_pin_memory" : true,
-        "overlap_comm": true,
-        "contiguous_gradients": true,
-        "sub_group_size": 1e14,
-        "reduce_bucket_size": 0,
-        "stage3_prefetch_bucket_size": 0,
-        "stage3_param_persistence_threshold": 0,
-        "stage3_max_live_parameters": 1e9,
-        "stage3_max_reuse_distance": 1e9,
-        "stage3_gather_fp16_weights_on_model_save": true
-    },
-
    "optimizer": {
        "type": "AdamW",
        "params": {
-            "lr": 3e-5,
-            "betas": [0.8, 0.999],
-            "eps": 1e-8,
-            "weight_decay": 3e-7
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
-            "warmup_min_lr": 0,
-            "warmup_max_lr": 3e-5,
-            "warmup_num_steps": 500
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
        }
    },

+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e14,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_fp16_weights_on_model_save": true
+    },
+
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
 }
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -42,7 +42,7 @@ with ExtendSysPath(f"{bindir}/.."):
    from test_trainer import TrainerIntegrationCommon  # noqa

    if is_torch_available():
-        from test_trainer import get_regression_trainer  # noqa
+        from test_trainer import RegressionModelConfig, RegressionPreTrainedModel, get_regression_trainer  # noqa


 set_seed(42)
@@ -66,6 +66,10 @@ def require_deepspeed(test_case):
        return test_case


+if is_deepspeed_available():
+    from deepspeed.utils import logger as deepspeed_logger  # noqa
+    from transformers.integrations import deepspeed_config, is_deepspeed_zero3_enabled  # noqa
+
 ZERO2 = "zero2"
 ZERO3 = "zero3"
 stages = [ZERO2, ZERO3]
@@ -115,12 +119,6 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
        with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f:
            self.ds_config_dict[ZERO3] = json.load(f)

-    def tearDown(self):
-        # XXX: Fixme - this is a temporary band-aid since this global variable impacts other tests
-        import transformers
-
-        transformers.integrations._is_deepspeed_zero3_enabled = None
-
    def get_config_dict(self, stage):
        """As the tests modify the dict, always make a copy"""
        config = deepcopy(self.ds_config_dict[stage])
@@ -173,25 +171,65 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
            trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero2_dict)
            with self.assertRaises(Exception) as context:
                trainer.train()
-        self.assertTrue("HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception))
+        self.assertTrue(
+            "HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception),
+            f"got exception: {context.exception}",
+        )

-    def test_hf_optimizer_with_offload(self):
-        # must not allow non-DS optimizer when using ZERO-offload
+    def test_stage3_nvme_offload(self):
        with mockenv_context(**self.dist_env_1_gpu):
-            ds_config_zero2_dict = self.get_config_dict(ZERO2)
-            del ds_config_zero2_dict["optimizer"]  # force default HF Trainer optimizer
-            ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = True
-            # sanity check - should the default config change
-            assert (
-                "cpu_offload" in ds_config_zero2_dict["zero_optimization"]
-                and ds_config_zero2_dict["zero_optimization"]["cpu_offload"] is True
-            ), "ensure the config is set up correctly"
-            trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero2_dict)
-            with self.assertRaises(Exception) as context:
+            # this actually doesn't have to be on NVMe, any storage will do since this test only
+            # runs a simple check that we can use some directory as if it were NVMe
+            nvme_path = self.get_auto_remove_tmp_dir()
+            nvme_config = dict(device="nvme", nvme_path=nvme_path)
+            ds_config_zero3_dict = self.get_config_dict(ZERO3)
+            ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config
+            ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config
+            trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_zero3_dict)
+            with CaptureLogger(deepspeed_logger) as cs:
                trainer.train()
-        self.assertTrue("ZeRO Offload can only work with DeepSpeed optimizers" in str(context.exception))
+            self.assertIn("DeepSpeed info", cs.out, "expected DeepSpeed logger output but got none")

    # --- These tests need to run on both zero stages --- #
+
+    @parameterized.expand(stages)
+    def test_fp32(self, stage):
+        ds_config_dict = self.get_config_dict(stage)
+        ds_config_dict["fp16"]["enabled"] = False  # force non-fp16 mode
+
+        # XXX: do we go via from_pretrained in zero 3 here? need to test zero.Init(dtype=torch.float)
+
+        # XXX: only checks that train() raises the fp16-required error; rewrite once DeepSpeed supports fp32
+        with mockenv_context(**self.dist_env_1_gpu):
+            trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict)
+            with self.assertRaises(Exception) as context:
+                trainer.train()
+            self.assertIn(
+                "ZeRO is only supported if fp16 is enabled",
+                str(context.exception),
+                f"got exception: {context.exception}",
+            )
+
+    @parameterized.expand(stages)
+    def test_hf_optimizer_with_offload(self, stage):
+        # must not allow non-DS optimizer when using ZERO-offload
+        ds_config_dict = self.get_config_dict(stage)
+        del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
+        # force cpu offload
+        if stage == "stage2":
+            ds_config_dict["zero_optimization"]["cpu_offload"] = True
+        elif stage == "stage3":
+            ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
+        with mockenv_context(**self.dist_env_1_gpu):
+            trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict)
+            with self.assertRaises(Exception) as context:
+                trainer.train()
+            self.assertIn(
+                "ZeRO Offload can only work with DeepSpeed optimizers",
+                str(context.exception),
+                f"got exception: {context.exception}",
+            )
+
    @parameterized.expand(stages)
    def test_fake_notebook_no_launcher(self, stage):
        # this setup emulates a notebook where a launcher needs to be emulated by hand
@@ -199,14 +237,12 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
        # note that unittest resets sys.stdout each test, so `CaptureStd` will work here to capture
        # DeepSpeed log if this test happens to run first in this pytest worker. But it will fail if
        # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
-        # to reset `logger.handlers[0].setStream(sys.stdout)` or directly capture from the logger.
-        from deepspeed.utils import logger
-
-        with CaptureLogger(logger) as cs:
-            with mockenv_context(**self.dist_env_1_gpu):
-                trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file[stage])
+        # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
+        with mockenv_context(**self.dist_env_1_gpu):
+            trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file[stage])
+            with CaptureLogger(deepspeed_logger) as cs:
                trainer.train()
-        assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none"
+            self.assertIn("DeepSpeed info", cs.out, "expected DeepSpeed logger output but got none")

    @parameterized.expand(stages)
    def test_early_get_last_lr(self, stage):
@@ -425,6 +461,38 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
            self.assertEqual(b, b1)
            self.check_trainer_state_are_the_same(state, state1)

+    def test_config_object(self):
+        # test that we can switch from a zero3 config to a zero2 config in the same process
+        # and that is_deepspeed_zero3_enabled() / deepspeed_config() reflect each switch
+        output_dir = self.get_auto_remove_tmp_dir()
+        kwargs = dict(output_dir=output_dir, train_len=8)
+
+        with mockenv_context(**self.dist_env_1_gpu):
+            ds_config_zero3_dict = self.get_config_dict("zero3")
+            ds_config_zero2_dict = self.get_config_dict("zero2")
+
+            trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs)
+            self.assertTrue(is_deepspeed_zero3_enabled())
+
+            # repeat the zero3 setup with a fresh trainer, this time also running train()
+            trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs)
+            trainer.train()
+            self.assertTrue(is_deepspeed_zero3_enabled())
+
+            # creating a trainer from the zero2 config must report zero3 as disabled
+            trainer = get_regression_trainer(deepspeed=ds_config_zero2_dict, **kwargs)
+            self.assertFalse(is_deepspeed_zero3_enabled())
+
+            # while the trainer is alive the config object must be available (truthy)
+            config = deepspeed_config()
+            self.assertTrue(bool(config), "Deepspeed config should be accessible")
+
+            del trainer
+            # now weakref should gc the global and we shouldn't get anything here
+            config = deepspeed_config()
+            self.assertFalse(is_deepspeed_zero3_enabled())
+            self.assertFalse(bool(config), "Deepspeed config should not be accessible")
+

@slow
@require_deepspeed
@@ -557,6 +625,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
            --adafactor
            --source_lang en
            --target_lang ro
+            --report_to none
        """.split()
        args.extend(["--source_prefix", '"translate English to Romanian: "'])

@@ -626,6 +695,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
            --num_train_epochs 1
            --warmup_steps 8
            --block_size 128
+            --report_to none
            """.split()

        ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()