non-native optimizers are mostly ok with zero-offload (#12690)
This commit is contained in:
@@ -1061,7 +1061,8 @@ optimizers, with the exception of using the combination of HuggingFace scheduler
|
|||||||
| DS Optimizer | No | Yes |
|
| DS Optimizer | No | Yes |
|
||||||
+--------------+--------------+--------------+
|
+--------------+--------------+--------------+
|
||||||
|
|
||||||
If ``offload_optimizer`` is enabled you must use both DeepSpeed scheduler and DeepSpeed optimizer.
|
It is possible to use a non-DeepSpeed optimizer when ``offload_optimizer`` is enabled, as long as it has both a CPU and a
|
||||||
|
GPU implementation (except LAMB).
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -315,9 +315,11 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None):
|
|||||||
#
|
#
|
||||||
# Unless Offload is enabled in which case it's:
|
# Unless Offload is enabled in which case it's:
|
||||||
# 1. DS scheduler + DS optimizer: Yes
|
# 1. DS scheduler + DS optimizer: Yes
|
||||||
# 2. HF scheduler + HF optimizer: No
|
# 2. HF scheduler + HF optimizer: Mostly*
|
||||||
# 3. DS scheduler + HF optimizer: No
|
# 3. DS scheduler + HF optimizer: Mostly*
|
||||||
# 4. HF scheduler + DS optimizer: No
|
# 4. HF scheduler + DS optimizer: No
|
||||||
|
#
|
||||||
|
# Mostly*: All non-DeepSpeed optimizers that have both a CPU and a GPU implementation should work (except LAMB)
|
||||||
|
|
||||||
optimizer = None
|
optimizer = None
|
||||||
if "optimizer" in config:
|
if "optimizer" in config:
|
||||||
@@ -328,7 +330,9 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None):
|
|||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
if hf_deepspeed_config.is_offload():
|
if hf_deepspeed_config.is_offload():
|
||||||
raise ValueError("ZeRO Offload can only work with DeepSpeed optimizers")
|
logger.info(
|
||||||
|
"Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB)"
|
||||||
|
)
|
||||||
|
|
||||||
# ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
|
# ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
|
||||||
# But trainer uses AdamW by default.
|
# But trainer uses AdamW by default.
|
||||||
|
|||||||
@@ -325,20 +325,16 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
|
|
||||||
@parameterized.expand(stages)
|
@parameterized.expand(stages)
|
||||||
def test_hf_optimizer_with_offload(self, stage):
|
def test_hf_optimizer_with_offload(self, stage):
|
||||||
# must not allow non-DS optimizer when using ZERO-offload
|
# non-DS optimizers can be used with ZeRO-Offload, as long as they have both a CPU and a GPU implementation (except LAMB)
|
||||||
ds_config_dict = self.get_config_dict(stage)
|
ds_config_dict = self.get_config_dict(stage)
|
||||||
del ds_config_dict["optimizer"] # force default HF Trainer optimizer
|
del ds_config_dict["optimizer"] # force default HF Trainer optimizer
|
||||||
# force cpu offload
|
# force cpu offload
|
||||||
ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
|
ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
|
||||||
with mockenv_context(**self.dist_env_1_gpu):
|
with mockenv_context(**self.dist_env_1_gpu):
|
||||||
trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_dict)
|
trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_dict)
|
||||||
with self.assertRaises(Exception) as context:
|
with CaptureLogger(deepspeed_logger) as cl:
|
||||||
trainer.train()
|
trainer.train()
|
||||||
self.assertIn(
|
self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
|
||||||
"ZeRO Offload can only work with DeepSpeed optimizers",
|
|
||||||
str(context.exception),
|
|
||||||
f"got exception: {context.exception}",
|
|
||||||
)
|
|
||||||
|
|
||||||
@parameterized.expand(stages)
|
@parameterized.expand(stages)
|
||||||
def test_fake_notebook_no_launcher(self, stage):
|
def test_fake_notebook_no_launcher(self, stage):
|
||||||
|
|||||||
Reference in New Issue
Block a user