Add support for FSDP+QLoRA and DeepSpeed ZeRO3+QLoRA (#29587)

* fsdp+qlora related changes * fixes * Update quantization_config.py * support fsdp+qlora and dsz3+qlora * Update quantization_config.py * Update modeling_utils.py * Update modeling_utils.py * Update modeling_utils.py * Update modeling_utils.py * Update modeling_utils.py * Update modeling_utils.py * handle fsdp+qlora and dsz3+qlora correctly while model loading * fix param count * quality * fsdp related changes * fsdp changes only when using LoRA/QLoRA * add accelerate version check * refactor, update min accelerate version and add tests 1. Update minimum accelerate version to 0.26.0 2. Clean the trainer wrt accelerate version checks 3. FSDP refactor and test for fsdp config 4. use `itemsize` instead of `dtype2bytes` dict * fix test * Address comments Co-Authored-By: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> * fix the conditional flag * fix conditional flag * address comments Co-Authored-By: Zach Mueller <7831895+muellerzr@users.noreply.github.com> --------- Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Co-authored-by: Zach Mueller <7831895+muellerzr@users.noreply.github.com>
2024-03-13 22:03:02 +05:30
parent d3801aae2e
commit 350c5d1566
6 changed files with 119 additions and 24 deletions
--- a/tests/fsdp/test_fsdp.py
+++ b/tests/fsdp/test_fsdp.py
@@ -15,6 +15,7 @@
 import itertools
 import os
 import unittest
+from copy import deepcopy
 from functools import partial

 from parameterized import parameterized
@@ -171,6 +172,44 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
                self.assertEqual(v, self.fsdp_config[k])
            self.assertEqual(os.environ.get("ACCELERATE_USE_FSDP", "false"), "true")

+    @parameterized.expand(params, name_func=_parameterized_custom_name_func)
+    def test_fsdp_config_transformers_auto_wrap(self, sharding_strategy, dtype):
+        output_dir = self.get_auto_remove_tmp_dir()
+        fsdp_config = deepcopy(self.fsdp_config)
+        del fsdp_config["min_num_params"]
+        fsdp_config["transformer_layer_cls_to_wrap"] = "BertLayer"
+        kwargs = {
+            "output_dir": output_dir,
+            "train_len": 128,
+            "save_steps": 5,
+            "learning_rate": 0.1,
+            "fsdp": f"{sharding_strategy} offload auto_wrap",
+            "fsdp_config": fsdp_config,
+        }
+        kwargs[dtype] = True
+        prefix = "FSDP_"
+        with mockenv_context(**self.dist_env_1_gpu):
+            trainer = get_regression_trainer(**kwargs)
+            self.assertEqual(trainer.args.fsdp[0], sharding_strategy)
+            self.assertEqual(trainer.args.fsdp[1], FSDPOption.OFFLOAD)
+            self.assertEqual(trainer.args.fsdp[2], FSDPOption.AUTO_WRAP)
+            fsdp_sharding_strategy = (
+                str(FSDP_SHARDING_STRATEGY.index(sharding_strategy.upper()) + 1)
+                if is_accelerate_available("0.26.0")
+                else sharding_strategy.upper()
+            )
+            self.assertEqual(os.environ[f"{prefix}SHARDING_STRATEGY"], fsdp_sharding_strategy)
+            self.assertEqual(os.environ[f"{prefix}OFFLOAD_PARAMS"], "true")
+            self.assertEqual(os.environ[f"{prefix}AUTO_WRAP_POLICY"], "TRANSFORMER_BASED_WRAP")
+            self.assertEqual(
+                os.environ[f"{prefix}TRANSFORMER_CLS_TO_WRAP"], ",".join(fsdp_config["transformer_layer_cls_to_wrap"])
+            )
+            self.assertEqual(os.environ[f"{prefix}BACKWARD_PREFETCH"], fsdp_config["backward_prefetch"].upper())
+            self.assertEqual(os.environ[f"{prefix}FORWARD_PREFETCH"], fsdp_config["forward_prefetch"])
+            self.assertEqual(os.environ[f"{prefix}USE_ORIG_PARAMS"], fsdp_config["use_orig_params"])
+            self.assertEqual(os.environ[f"{prefix}SYNC_MODULE_STATES"], fsdp_config["sync_module_states"])
+            self.assertEqual(os.environ.get("ACCELERATE_USE_FSDP", "false"), "true")
+
    @parameterized.expand(params, name_func=_parameterized_custom_name_func)
    @require_torch_multi_accelerator
    @slow