Add support for FSDP+QLoRA and DeepSpeed ZeRO3+QLoRA (#29587)
* fsdp+qlora related changes
* fixes
* Update quantization_config.py
* support fsdp+qlora and dsz3+qlora
* Update quantization_config.py
* Update modeling_utils.py
* Update modeling_utils.py
* Update modeling_utils.py
* Update modeling_utils.py
* Update modeling_utils.py
* Update modeling_utils.py
* handle fsdp+qlora and dsz3+qlora correctly while model loading
* fix param count
* quality
* fsdp related changes
* fsdp changes only when using LoRA/QLoRA
* add accelerate version check
* refactor, update min accelerate version and add tests
  1. Update minimum accelerate version to 0.26.0
  2. Clean the trainer wrt accelerate version checks
  3. FSDP refactor and test for fsdp config
  4. use `itemsize` instead of `dtype2bytes` dict
* fix test
* Address comments
  Co-Authored-By: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
* fix the conditional flag
* fix conditional flag
* address comments
  Co-Authored-By: Zach Mueller <7831895+muellerzr@users.noreply.github.com>
---------
Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com>
Co-authored-by: Zach Mueller <7831895+muellerzr@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
d3801aae2e
commit
350c5d1566
@@ -15,6 +15,7 @@
|
||||
import itertools
|
||||
import os
|
||||
import unittest
|
||||
from copy import deepcopy
|
||||
from functools import partial
|
||||
|
||||
from parameterized import parameterized
|
||||
@@ -171,6 +172,44 @@ class TrainerIntegrationFSDP(TestCasePlus, TrainerIntegrationCommon):
|
||||
self.assertEqual(v, self.fsdp_config[k])
|
||||
self.assertEqual(os.environ.get("ACCELERATE_USE_FSDP", "false"), "true")
|
||||
|
||||
@parameterized.expand(params, name_func=_parameterized_custom_name_func)
def test_fsdp_config_transformers_auto_wrap(self, sharding_strategy, dtype):
    # Builds a regression trainer whose FSDP config selects wrapping by
    # transformer layer class ("BertLayer") instead of by `min_num_params`,
    # then checks both the parsed `trainer.args.fsdp` options and the
    # FSDP_* environment variables visible while the mocked env is active.
    tmp_dir = self.get_auto_remove_tmp_dir()

    # Work on a private copy so the shared `self.fsdp_config` is not edited
    # by this test.
    wrap_config = deepcopy(self.fsdp_config)
    wrap_config.pop("min_num_params")
    wrap_config["transformer_layer_cls_to_wrap"] = "BertLayer"

    trainer_kwargs = {
        "output_dir": tmp_dir,
        "train_len": 128,
        "save_steps": 5,
        "learning_rate": 0.1,
        "fsdp": f"{sharding_strategy} offload auto_wrap",
        "fsdp_config": wrap_config,
        # `dtype` is the name of the boolean trainer flag to switch on.
        dtype: True,
    }
    env_prefix = "FSDP_"

    with mockenv_context(**self.dist_env_1_gpu):
        trainer = get_regression_trainer(**trainer_kwargs)

        # The "fsdp" string is expected to be parsed into three options,
        # in the order they were written.
        parsed_options = trainer.args.fsdp
        self.assertEqual(parsed_options[0], sharding_strategy)
        self.assertEqual(parsed_options[1], FSDPOption.OFFLOAD)
        self.assertEqual(parsed_options[2], FSDPOption.AUTO_WRAP)

        # Expected env value is the 1-based position in
        # FSDP_SHARDING_STRATEGY when the accelerate version check passes,
        # otherwise the upper-cased strategy name.
        if is_accelerate_available("0.26.0"):
            expected_strategy = str(FSDP_SHARDING_STRATEGY.index(sharding_strategy.upper()) + 1)
        else:
            expected_strategy = sharding_strategy.upper()
        self.assertEqual(os.environ[f"{env_prefix}SHARDING_STRATEGY"], expected_strategy)

        self.assertEqual(os.environ[f"{env_prefix}OFFLOAD_PARAMS"], "true")
        self.assertEqual(os.environ[f"{env_prefix}AUTO_WRAP_POLICY"], "TRANSFORMER_BASED_WRAP")

        # NOTE(review): this test stores a plain string above, and joining a
        # string inserts commas between its characters. The comparison
        # presumably relies on the trainer arguments replacing the value
        # with a list in place -- confirm against the argument parsing code.
        expected_wrap_classes = ",".join(wrap_config["transformer_layer_cls_to_wrap"])
        self.assertEqual(os.environ[f"{env_prefix}TRANSFORMER_CLS_TO_WRAP"], expected_wrap_classes)

        expected_backward_prefetch = wrap_config["backward_prefetch"].upper()
        self.assertEqual(os.environ[f"{env_prefix}BACKWARD_PREFETCH"], expected_backward_prefetch)

        # The remaining settings are compared against the config values
        # unchanged.
        self.assertEqual(os.environ[f"{env_prefix}FORWARD_PREFETCH"], wrap_config["forward_prefetch"])
        self.assertEqual(os.environ[f"{env_prefix}USE_ORIG_PARAMS"], wrap_config["use_orig_params"])
        self.assertEqual(os.environ[f"{env_prefix}SYNC_MODULE_STATES"], wrap_config["sync_module_states"])

        self.assertEqual(os.environ.get("ACCELERATE_USE_FSDP", "false"), "true")
|
||||
|
||||
@parameterized.expand(params, name_func=_parameterized_custom_name_func)
|
||||
@require_torch_multi_accelerator
|
||||
@slow
|
||||
|
||||
Reference in New Issue
Block a user