[Trainer] implement gradient_accumulation_steps support in DeepSpeed integration (#10310)

* implement gradient_accumulation_steps support in DeepSpeed integration * typo * cleanup * cleanup
2021-02-22 11:15:59 -08:00
parent f991daed18
commit eab0afc19c
5 changed files with 162 additions and 27 deletions
--- a/examples/tests/deepspeed/test_deepspeed.py
+++ b/examples/tests/deepspeed/test_deepspeed.py
@@ -23,7 +23,7 @@ from transformers.testing_utils import (
    TestCasePlus,
    execute_subprocess_async,
    get_gpu_count,
-    mockenv,
+    mockenv_context,
    require_torch_gpu,
    require_torch_multi_gpu,
    slow,
@@ -31,6 +31,11 @@ from transformers.testing_utils import (
 from transformers.trainer_utils import set_seed


+bindir = os.path.abspath(os.path.dirname(__file__))
+sys.path.append(f"{bindir}/../../../tests")
+from test_trainer import get_regression_trainer  # noqa
+
+
 set_seed(42)
 MBART_TINY = "sshleifer/tiny-mbart"

@@ -51,32 +56,96 @@ def require_deepspeed(test_case):
        return test_case


+@require_deepspeed
+@require_torch_gpu
+class TrainerIntegrationDeepSpeed(TestCasePlus):
+    """ This class is for testing directly via get_regression_trainer """
+
+    def setUp(self):
+        super().setUp()
+        self.dist_env_1_gpu = dict(
+            MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
+        )
+        self.ds_config_file = f"{self.test_file_dir_str}/ds_config.json"
+
+    def test_fake_notebook_no_launcher(self):
+
+        # this setup emulates a notebook where a launcher needs to be emulated by hand
+
+        with CaptureStd() as cs:
+            with mockenv_context(**self.dist_env_1_gpu):
+                trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file)
+                trainer.train()
+        assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none"
+
+    def test_gradient_accumulation(self):
+
+        # this test measures that we get identical weights and similar loss with:
+        # 1. per_device_train_batch_size=8, gradient_accumulation_steps=1
+        # 2. per_device_train_batch_size=4, gradient_accumulation_steps=2
+        # since the 2nd should produce the effective batch of 1st, with the same results
+        #
+        # I can get an identical loss for a small train_len=32, plus the power of the initial
+        # dynamic loss scale value set to:
+        #   "fp16.initial_scale_power": 1
+        # plus having the same WarmupLR's warmup_min_lr == warmup_max_lr in the config file
+        # but for some reason going to train_len=64 the weights, weights start to mismatch with this setup.
+        # the culprit seems to be `initial_scale_power` - putting it back to its default 32 keeps the weights identical
+
+        train_len = 64
+        a = b = 0.0
+
+        with mockenv_context(**self.dist_env_1_gpu):
+            no_grad_accum_trainer = get_regression_trainer(
+                a=a,
+                b=b,
+                local_rank=0,
+                train_len=train_len,
+                deepspeed=self.ds_config_file,
+                per_device_train_batch_size=8,
+                gradient_accumulation_steps=1,
+            )
+            no_grad_accum_result = no_grad_accum_trainer.train()
+            no_grad_accum_loss = no_grad_accum_result.training_loss
+            no_grad_accum_a = no_grad_accum_trainer.model.a.item()
+            no_grad_accum_b = no_grad_accum_trainer.model.b.item()
+            # make sure the optimizer kicked in - if it hasn't changed from the original value of a then make train_len bigger
+            self.assertNotEqual(no_grad_accum_a, a)
+
+        with mockenv_context(**self.dist_env_1_gpu):
+            yes_grad_accum_trainer = get_regression_trainer(
+                a=a,
+                b=b,
+                local_rank=0,
+                train_len=train_len,
+                deepspeed=self.ds_config_file,
+                per_device_train_batch_size=4,
+                gradient_accumulation_steps=2,
+            )
+            yes_grad_accum_result = yes_grad_accum_trainer.train()
+            yes_grad_accum_loss = yes_grad_accum_result.training_loss
+            yes_grad_accum_a = yes_grad_accum_trainer.model.a.item()
+            yes_grad_accum_b = yes_grad_accum_trainer.model.b.item()
+            self.assertNotEqual(yes_grad_accum_a, a)
+
+        # training with half the batch size but accumulation steps as 2 should give the same weights
+        self.assertEqual(no_grad_accum_a, yes_grad_accum_a)
+        self.assertEqual(no_grad_accum_b, yes_grad_accum_b)
+
+        # see the note above how to get identical loss on a small bs
+        self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=5)
+
+
@slow
@require_deepspeed
@require_torch_gpu
 class TestDeepSpeed(TestCasePlus):
-
-    # this setup emulates a notebook where a launcher needs to be emulated by hand
-    @mockenv(MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1")
-    def test_fake_notebook_no_launcher(self):
-        sys.path.append(self.tests_dir_str)
-        from test_trainer import get_regression_trainer
-
-        del sys.path[-1]  # restore
-        ds_config_file = f"{self.test_file_dir_str}/ds_config.json"
-        with CaptureStd() as cs:
-            trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_file)
-            trainer.train()
-        assert "DeepSpeed info" in cs.out, "expected DeepSpeed logger output but got none"
+    """ This class is for testing via an external script """

    @require_torch_multi_gpu
    def test_basic_distributed(self):
        self.run_quick(distributed=True)

-    @require_torch_multi_gpu
-    def test_grad_acum(self):
-        self.run_quick(distributed=True, extra_args_str="--gradient_accumulation_steps 2")
-
    def test_do_eval_no_train(self):
        # we should not fail if train is skipped
        output_dir = self.run_trainer(