[trainer] deepspeed integration (#9211)

* deepspeed integration * style * add test * ds wants to do its own backward * fp16 assert * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * style * for clarity extract what args are being passed to deepspeed * introduce the concept of self.wrapped_model * s/self.wrapped_model/self.model_wrapped/ * complete transition to self.wrapped_model / self.model * fix * doc * give ds its own init * add custom overrides, handle bs correctly * fix test * clean up model_init logic, fix small bug * complete fix * collapse --deepspeed_config into --deepspeed * style * start adding doc notes * style * implement hf2ds optimizer and scheduler configuration remapping * oops * call get_num_training_steps absolutely when needed * workaround broken auto-formatter * deepspeed_config arg is no longer needed - fixed in deepspeed master * use hf's fp16 args in config * clean * start on the docs * rebase cleanup * finish up --fp16 * clarify the supported stages * big refactor thanks to discovering deepspeed.init_distributed * cleanup * revert fp16 part * add checkpoint-support * more init ds into integrations * extend docs * cleanup * unfix docs * clean up old code * imports * move docs * fix logic * make it clear which file it's referring to * document nodes/gpus * style * wrong format * style * deepspeed handles gradient clipping * easier to read * major doc rewrite * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * docs * switch to AdamW optimizer * style * Apply suggestions from code review Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * clarify doc Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2021-01-12 19:05:18 -08:00
parent 5f6721032a
commit 2df34f4aba
7 changed files with 741 additions and 57 deletions
--- a/examples/seq2seq/ds_config.json
+++ b/examples/seq2seq/ds_config.json
@@ -0,0 +1,47 @@
+{
+    "fp16": {
+        "enabled": true,
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+
+   "zero_optimization": {
+       "stage": 2,
+       "allgather_partitions": true,
+       "allgather_bucket_size": 2e8,
+       "overlap_comm": true,
+       "reduce_scatter": true,
+       "reduce_bucket_size": 2e8,
+       "contiguous_gradients": true,
+       "cpu_offload": true
+   },
+
+   "zero_allow_untested_optimizer": true,
+
+   "optimizer": {
+     "type": "AdamW",
+     "params": {
+       "lr": 3e-5,
+       "betas": [
+         0.8,
+         0.999
+       ],
+       "eps": 1e-8,
+       "weight_decay": 3e-7
+     }
+   },
+
+   "scheduler": {
+     "type": "WarmupLR",
+     "params": {
+       "warmup_min_lr": 0,
+       "warmup_max_lr": 3e-5,
+       "warmup_num_steps": 500
+     }
+   },
+
+    "steps_per_print": 2000,
+    "wall_clock_breakdown": false
+}
--- a/examples/seq2seq/test_finetune_trainer.py
+++ b/examples/seq2seq/test_finetune_trainer.py
@@ -18,7 +18,7 @@ import unittest
 from unittest.mock import patch

 from transformers.file_utils import is_apex_available
-from transformers.integrations import is_fairscale_available
+from transformers.integrations import is_deepspeed_available, is_fairscale_available
 from transformers.testing_utils import (
    TestCasePlus,
    execute_subprocess_async,
@@ -49,6 +49,17 @@ def require_fairscale(test_case):
        return test_case


+# a candidate for testing_utils
+def require_deepspeed(test_case):
+    """
+    Decorator marking a test that requires deepspeed. The test is skipped when deepspeed is not available.
+    """
+    if not is_deepspeed_available():
+        return unittest.skip("test requires deepspeed")(test_case)
+    else:
+        return test_case
+
+
 # a candidate for testing_utils
 def require_apex(test_case):
    """
@@ -61,8 +72,8 @@ def require_apex(test_case):


 class TestFinetuneTrainer(TestCasePlus):
-    def finetune_trainer_quick(self, distributed=None, extra_args_str=None):
-        output_dir = self.run_trainer(1, "12", MBART_TINY, 1, distributed, extra_args_str)
+    def finetune_trainer_quick(self, distributed=None, deepspeed=False, extra_args_str=None):
+        output_dir = self.run_trainer(1, "12", MBART_TINY, 1, distributed, deepspeed, extra_args_str)
        logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
        eval_metrics = [log for log in logs if "eval_loss" in log.keys()]
        first_step_stats = eval_metrics[0]
@@ -96,6 +107,11 @@ class TestFinetuneTrainer(TestCasePlus):
    def test_finetune_trainer_apex(self):
        self.finetune_trainer_quick(extra_args_str="--fp16 --fp16_backend=apex")

+    @require_torch_multi_gpu
+    @require_deepspeed
+    def test_finetune_trainer_deepspeed(self):
+        self.finetune_trainer_quick(deepspeed=True)
+
    @slow
    def test_finetune_trainer_slow(self):
        # There is a missing call to __init__process_group somewhere
@@ -125,6 +141,7 @@ class TestFinetuneTrainer(TestCasePlus):
        model_name: str,
        num_train_epochs: int,
        distributed: bool = False,
+        deepspeed: bool = False,
        extra_args_str: str = None,
    ):
        data_dir = self.examples_dir / "seq2seq/test_data/wmt_en_ro"
@@ -164,7 +181,15 @@ class TestFinetuneTrainer(TestCasePlus):
        if extra_args_str is not None:
            args.extend(extra_args_str.split())

-        if distributed:
+        if deepspeed:
+            ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config.json".split()
+            distributed_args = f"""
+                {self.test_file_dir}/finetune_trainer.py
+            """.split()
+            cmd = ["deepspeed"] + distributed_args + args + ds_args
+            execute_subprocess_async(cmd, env=self.get_env())
+
+        elif distributed:
            n_gpu = get_gpu_count()
            distributed_args = f"""
                -m torch.distributed.launch
@@ -173,6 +198,7 @@ class TestFinetuneTrainer(TestCasePlus):
            """.split()
            cmd = [sys.executable] + distributed_args + args
            execute_subprocess_async(cmd, env=self.get_env())
+
        else:
            testargs = ["finetune_trainer.py"] + args
            with patch.object(sys, "argv", testargs):