[DeepSpeed] fp32 support (#11499)

* prep for deepspeed==0.3.16 * new version * too soon * support and test fp32 mode * troubleshooting doc start * workaround no longer needed * add fp32 doc * style * cleanup, add tf32 note * clarify * release was made
2021-04-30 12:51:48 -07:00
parent 282f3ac3ef
commit 4e7bf94e72
6 changed files with 139 additions and 70 deletions
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -1507,6 +1507,35 @@ and ``total_num_steps`, ``warmup_max_lr``, ``warmup_num_steps`` and ``total_num_
 fp32 Precision
 =======================================================================================================================
 DeepSpeed supports both full fp32 and fp16 mixed precision.
 Because of the much reduced memory needs and faster speed one gets with the fp16 mixed precision, the only time you
 will want to not use it is when the model you're using doesn't behave well under this training mode. Typically this
 happens when the model wasn't pretrained in the fp16 mixed precision (e.g. often this happens with bf16-pretrained
 models). Such models may overflow or underflow leading to ``NaN`` loss. If this is your case then you will want to use
 the full fp32 mode, by explicitly disabling the otherwise default fp16 mixed precision mode with:
 .. code-block:: json
    {
        "fp16": {
            "enabled": false
        }
    }
 If you're using an Ampere-architecture based GPU, PyTorch version 1.7 and higher will automatically switch to using
 the much more efficient tf32 format for some operations, but the results will still be in fp32. For details and
 benchmarks, please see `TensorFloat-32(TF32) on Ampere devices
 <https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices>`__. The document includes
 instructions on how to disable this automatic conversion if for some reason you prefer not to use it.
 Automatic Mixed Precision
 =======================================================================================================================
@@ -1532,11 +1561,6 @@ and the :class:`~transformers.Trainer` will automatically enable or disable it b
 This mode gets enabled when ``--fp16 --fp16_backend amp`` command line args are passed.
 .. note::
   At the moment DeepSpeed doesn't support fp32 mode, though it will become available soon. Until then it will
   always be set to ``true``.
 You can also enable/disable this mode explicitly:
 .. code-block:: json
@@ -1790,6 +1814,24 @@ stress on ``tensor([1.])``, or if you get an error where it says the parameter i
 larger multi-dimensional shape, this means that the parameter is partitioned and what you see is a ZeRO-3 placeholder.
 Troubleshooting
 =======================================================================================================================
 * ``deepspeed`` process gets killed at startup without a traceback
 If the ``deepspeed`` process gets killed at launch time without a traceback, that usually means that the program tried
 to allocate more CPU memory than your system has or your process is allowed to allocate and the OS kernel killed that
 process. This is because your configuration file most likely has either ``offload_optimizer`` or ``offload_param`` or
 both configured to offload to ``cpu`` (or under ZeRO-2 ``cpu_offload`` is enabled). If you have NVMe and are running
 under ZeRO-3, experiment with offloading to NVMe.
 Work is being done to enable estimating how much memory is needed for a specific model: `PR
 <https://github.com/microsoft/DeepSpeed/pull/965>`__.
 Notes
 =======================================================================================================================
--- a/setup.py
+++ b/setup.py
@@ -90,7 +90,7 @@ _deps = [
    "cookiecutter==1.7.2",
    "dataclasses",
    "datasets",
-    "deepspeed>=0.3.15",
+    "deepspeed>=0.3.16",
    "docutils==0.16.0",
    "fairscale>0.3",
    "faiss-cpu",
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -7,7 +7,7 @@ deps = {
    "cookiecutter": "cookiecutter==1.7.2",
    "dataclasses": "dataclasses",
    "datasets": "datasets",
-    "deepspeed": "deepspeed>=0.3.15",
+    "deepspeed": "deepspeed>=0.3.16",
    "docutils": "docutils==0.16.0",
    "fairscale": "fairscale>0.3",
    "faiss-cpu": "faiss-cpu",
--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -374,10 +374,7 @@ class DeepSpeedConfigHF:
        # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set
        # any here unless the user did the work
        config_fp16 = config.get("fp16")
-        # XXX: at the moment fp16 can't be False, but the fp32 solution is in works - once it's PR'ed and
+        _set_if_auto(config_fp16, "enabled", fp16_backend == "amp")
        # merged and a new release is made, delete the next line and uncomment the one after it
        _set_if_auto(config_fp16, "enabled", True)
        # _set_if_auto(config_fp16, "enabled", fp16_backend == "amp")
        # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any
        # ZeRO features, so probably best to be avoided.
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -44,7 +44,7 @@ from .file_utils import (
    replace_return_docstrings,
 )
 from .generation_utils import GenerationMixin
-from .integrations import is_deepspeed_zero3_enabled
+from .integrations import deepspeed_config, is_deepspeed_zero3_enabled
 from .utils import logging
@@ -1124,10 +1124,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
            logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
            # this immediately partitions the model across all gpus, to avoid the overhead in time
            # and memory copying it on CPU or each GPU first
-
+            with deepspeed.zero.Init(config=deepspeed_config()):
            # XXX: param_dict will be added in deepspeed==0.3.16 and probably replaced by deepspeed_config
            # with deepspeed.zero.Init(param_dict=deepspeed_config()):
            with deepspeed.zero.Init():
                model = cls(config, *model_args, **model_kwargs)
        else:
            model = cls(config, *model_args, **model_kwargs)
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -48,6 +48,7 @@ with ExtendSysPath(f"{bindir}/.."):
 set_seed(42)
 MBART_TINY = "sshleifer/tiny-mbart"
 T5_SMALL = "t5-small"
 T5_TINY = "patrickvonplaten/t5-tiny-random"
 def load_json(path):
@@ -108,25 +109,31 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
            MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
        )
-        self.ds_config_file = {}
+        self.ds_config_file = dict(
-        self.ds_config_file[ZERO2] = f"{self.test_file_dir_str}/ds_config_zero2.json"
+            zero2=f"{self.test_file_dir_str}/ds_config_zero2.json",
-        self.ds_config_file[ZERO3] = f"{self.test_file_dir_str}/ds_config_zero3.json"
+            zero3=f"{self.test_file_dir_str}/ds_config_zero3.json",
        )
        # use self.get_config_dict(stage) to use these to ensure the original is not modified
        self.ds_config_dict = {}
        with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:
-            self.ds_config_dict[ZERO2] = json.load(f)
+            config_zero2 = json.load(f)
            # by default use fp16
            config_zero2["fp16"]["enabled"] = True
        with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f:
-            self.ds_config_dict[ZERO3] = json.load(f)
+            config_zero3 = json.load(f)
-
+            # by default use fp16
-    def get_config_dict(self, stage):
+            config_zero3["fp16"]["enabled"] = True
        """As the tests modify the dict, always make a copy"""
        config = deepcopy(self.ds_config_dict[stage])
        if stage == ZERO3:
            # This setting slows things down, so don't enable it by default unless needed by a test.
            # It's in the file as a demo for users since we want everything to work out of the box even if slower.
-            config["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False
+            config_zero3["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False
-        return config
+        self.ds_config_dict = dict(
            zero2=config_zero2,
            zero3=config_zero3,
        )
    def get_config_dict(self, stage):
        # As some tests modify the dict, always make a copy
        return deepcopy(self.ds_config_dict[stage])
    # --- These tests are enough to run on one of zero stages --- #
@@ -192,24 +199,6 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
    # --- These tests need to run on both zero stages --- #
    @parameterized.expand(stages)
    def test_fp32(self, stage):
        ds_config_dict = self.get_config_dict(stage)
        ds_config_dict["fp16"]["enabled"] = False  # force non-fp16 mode
        # XXX: do we go via from_pretrained in zero 3 here? need to test zero.Init(dtype=torch.float)
        # XXX: rewrite this test once fp32 is supported by DeepSpeed
        with mockenv_context(**self.dist_env_1_gpu):
            trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict)
            with self.assertRaises(Exception) as context:
                trainer.train()
            self.assertIn(
                "ZeRO is only supported if fp16 is enabled",
                str(context.exception),
                f"got exception: {context.exception}",
            )
    @parameterized.expand(stages)
    def test_hf_optimizer_with_offload(self, stage):
        # must not allow non-DS optimizer when using ZERO-offload
@@ -239,7 +228,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
        # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
        # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
        with mockenv_context(**self.dist_env_1_gpu):
-            trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file[stage])
+            trainer = get_regression_trainer(local_rank=0, deepspeed=self.get_config_dict(stage))
            with CaptureLogger(deepspeed_logger) as cs:
                trainer.train()
            self.assertIn("DeepSpeed info", cs.out, "expected DeepSpeed logger output but got none")
@@ -259,7 +248,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
                b=b,
                local_rank=0,
                train_len=8,
-                deepspeed=self.ds_config_file[stage],
+                deepspeed=self.get_config_dict(stage),
                per_device_train_batch_size=8,
                logging_steps=1,
            )
@@ -267,7 +256,11 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
            post_train_a = trainer.model.a.item()
            # XXX: for some reason the following check fails with zero3 - not a broken but a
-            # different qualitative outcome - need to investigate at some point
+            # different qualitative outcome - as if optimizer did run
            # oddly getting 1.0 for both a and b from 0.0 - there is a bug somewhere
            # print(trainer.model.a.item())
            # print(trainer.model.b.item())
            # need to investigate at some point
            if stage == ZERO3:
                return
@@ -298,7 +291,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
                b=b,
                local_rank=0,
                train_len=train_len,
-                deepspeed=self.ds_config_file[stage],
+                deepspeed=self.get_config_dict(stage),
                per_device_train_batch_size=8,
                gradient_accumulation_steps=1,
            )
@@ -315,7 +308,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
                b=b,
                local_rank=0,
                train_len=train_len,
-                deepspeed=self.ds_config_file[stage],
+                deepspeed=self.get_config_dict(stage),
                per_device_train_batch_size=4,
                gradient_accumulation_steps=2,
            )
@@ -532,6 +525,35 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
            do_eval=True,
        )
    @parameterized.expand(stages)
    def test_fp32_non_distributed(self, stage):
        # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
        # therefore no quality checks, just basic completion checks are done
        self.run_and_check(
            stage=stage,
            model_name=T5_TINY,
            distributed=False,
            do_train=True,
            do_eval=True,
            quality_checks=False,
            fp16=False,
        )
    @require_torch_multi_gpu
    @parameterized.expand(stages)
    def test_fp32_distributed(self, stage):
        # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
        # therefore no quality checks, just basic completion checks are done
        self.run_and_check(
            stage=stage,
            model_name=T5_TINY,
            distributed=True,
            do_train=True,
            do_eval=True,
            quality_checks=False,
            fp16=False,
        )
    @parameterized.expand(stages)
    def test_resume_train_not_from_ds_checkpoint(self, stage):
        # do normal training and then resume not from the deepspeed checkpoint but explicitly from
@@ -550,44 +572,50 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        self.do_checks(output_dir, do_train=do_train, do_eval=do_eval)
-    def do_checks(self, output_dir, do_train=True, do_eval=True):
+    def do_checks(self, output_dir, do_train=True, do_eval=True, quality_checks=True):
        if do_train:
            train_metrics = load_json(os.path.join(output_dir, "train_results.json"))
            self.assertIn("train_samples_per_second", train_metrics)
-            self.assertGreater(train_metrics["train_samples_per_second"], 0.5)
+            if quality_checks:
                self.assertGreater(train_metrics["train_samples_per_second"], 0.5)
        if do_eval:
            eval_metrics = load_json(os.path.join(output_dir, "eval_results.json"))
            self.assertIn("eval_bleu", eval_metrics)
-            self.assertGreater(eval_metrics["eval_bleu"], 0)
+            if quality_checks:
                self.assertGreater(eval_metrics["eval_bleu"], 1)
    # XXX: need to do better validation beyond just that the run was successful
    def run_and_check(
        self,
        stage,
-        eval_steps=10,
+        model_name: str = T5_SMALL,
-        distributed=True,
+        eval_steps: int = 10,
-        do_train=True,
+        distributed: bool = True,
-        do_eval=True,
+        do_train: bool = True,
-        extra_args_str=None,
+        do_eval: bool = True,
-        remove_args_str=None,
+        quality_checks: bool = True,
        fp16: bool = True,
        extra_args_str: str = None,
        remove_args_str: str = None,
    ):
        # we are doing quality testing so using a small real model
        output_dir = self.run_trainer(
            stage=stage,
-            model_name=T5_SMALL,
+            model_name=model_name,
            eval_steps=eval_steps,
            num_train_epochs=1,
            do_train=do_train,
            do_eval=do_eval,
            distributed=distributed,
            fp16=fp16,
            extra_args_str=extra_args_str,
            remove_args_str=remove_args_str,
        )
-        self.do_checks(output_dir, do_train=do_train, do_eval=do_eval)
+        self.do_checks(output_dir, do_train=do_train, do_eval=do_eval, quality_checks=quality_checks)
        return output_dir
@@ -600,6 +628,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        do_train: bool = False,
        do_eval: bool = True,
        distributed: bool = True,
        fp16: bool = True,
        extra_args_str: str = None,
        remove_args_str: str = None,
    ):
@@ -629,6 +658,9 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        """.split()
        args.extend(["--source_prefix", '"translate English to Romanian: "'])
        if fp16:
            args.extend(["--fp16"])
        actions = 0
        if do_train:
            actions += 1
@@ -636,7 +668,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
                f"""
            --do_train
            --num_train_epochs {str(num_train_epochs)}
-            --max_train_samples 100
+            --max_train_samples 16
            --per_device_train_batch_size 2
            --learning_rate 3e-3
            """.split()
@@ -647,7 +679,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
            args.extend(
                """
            --do_eval
-            --max_eval_samples 100
+            --max_eval_samples 16
            --per_device_eval_batch_size 2
            """.split()
            )
@@ -688,13 +720,14 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
            --overwrite_output_dir
            --do_train
            --do_eval
-            --max_train_samples 10
+            --max_train_samples 16
-            --max_eval_samples 10
+            --max_eval_samples 16
-            --per_device_train_batch_size 5
+            --per_device_train_batch_size 2
-            --per_device_eval_batch_size 5
+            --per_device_eval_batch_size 2
            --num_train_epochs 1
            --warmup_steps 8
-            --block_size 128
+            --block_size 64
            --fp16
            --report_to none
            """.split()