Add support for FSDP+QLoRA and DeepSpeed ZeRO3+QLoRA (#29587)

* fsdp+qlora related changes

* fixes

* Update quantization_config.py

* support fsdp+qlora and dsz3+qlora

* Update quantization_config.py

* Update modeling_utils.py

* Update modeling_utils.py

* Update modeling_utils.py

* Update modeling_utils.py

* Update modeling_utils.py

* Update modeling_utils.py

* handle fsdp+qlora and dsz3+qlora correctly while model loading

* fix param count

* quality

* fsdp related changes

* fsdp changes only when using LoRA/QLoRA

* add accelerate version check

* refactor, update min accelerate version and add tests

1. Update minimum accelerate version to 0.26.0
2. Clean the trainer wrt accelerate version checks
3. FSDP refactor and test for fsdp config
4. use `itemsize` instead of `dtype2bytes` dict

* fix test

* Address comments

Co-Authored-By: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>

* fix the conditional flag

* fix conditional flag

* address comments

Co-Authored-By: Zach Mueller <7831895+muellerzr@users.noreply.github.com>

---------

Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Co-authored-by: Zach Mueller <7831895+muellerzr@users.noreply.github.com>
This commit is contained in:
Sourab Mangrulkar
2024-03-13 22:03:02 +05:30
committed by GitHub
parent d3801aae2e
commit 350c5d1566
6 changed files with 119 additions and 24 deletions

View File

@@ -15,6 +15,7 @@
import itertools
import os
import unittest
from copy import deepcopy
from functools import partial
from parameterized import parameterized
@@ -171,6 +172,44 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
self.assertEqual(v, self.fsdp_config[k])
self.assertEqual(os.environ.get("ACCELERATE_USE_FSDP", "false"), "true")
@parameterized.expand(params, name_func=_parameterized_custom_name_func)
def test_fsdp_config_transformers_auto_wrap(self, sharding_strategy, dtype):
output_dir = self.get_auto_remove_tmp_dir()
fsdp_config = deepcopy(self.fsdp_config)
del fsdp_config["min_num_params"]
fsdp_config["transformer_layer_cls_to_wrap"] = "BertLayer"
kwargs = {
"output_dir": output_dir,
"train_len": 128,
"save_steps": 5,
"learning_rate": 0.1,
"fsdp": f"{sharding_strategy} offload auto_wrap",
"fsdp_config": fsdp_config,
}
kwargs[dtype] = True
prefix = "FSDP_"
with mockenv_context(**self.dist_env_1_gpu):
trainer = get_regression_trainer(**kwargs)
self.assertEqual(trainer.args.fsdp[0], sharding_strategy)
self.assertEqual(trainer.args.fsdp[1], FSDPOption.OFFLOAD)
self.assertEqual(trainer.args.fsdp[2], FSDPOption.AUTO_WRAP)
fsdp_sharding_strategy = (
str(FSDP_SHARDING_STRATEGY.index(sharding_strategy.upper()) + 1)
if is_accelerate_available("0.26.0")
else sharding_strategy.upper()
)
self.assertEqual(os.environ[f"{prefix}SHARDING_STRATEGY"], fsdp_sharding_strategy)
self.assertEqual(os.environ[f"{prefix}OFFLOAD_PARAMS"], "true")
self.assertEqual(os.environ[f"{prefix}AUTO_WRAP_POLICY"], "TRANSFORMER_BASED_WRAP")
self.assertEqual(
os.environ[f"{prefix}TRANSFORMER_CLS_TO_WRAP"], ",".join(fsdp_config["transformer_layer_cls_to_wrap"])
)
self.assertEqual(os.environ[f"{prefix}BACKWARD_PREFETCH"], fsdp_config["backward_prefetch"].upper())
self.assertEqual(os.environ[f"{prefix}FORWARD_PREFETCH"], fsdp_config["forward_prefetch"])
self.assertEqual(os.environ[f"{prefix}USE_ORIG_PARAMS"], fsdp_config["use_orig_params"])
self.assertEqual(os.environ[f"{prefix}SYNC_MODULE_STATES"], fsdp_config["sync_module_states"])
self.assertEqual(os.environ.get("ACCELERATE_USE_FSDP", "false"), "true")
@parameterized.expand(params, name_func=_parameterized_custom_name_func)
@require_torch_multi_accelerator
@slow