diff --git a/docs/source/main_classes/deepspeed.rst b/docs/source/main_classes/deepspeed.rst index 05a86b0801..2769ca044a 100644 --- a/docs/source/main_classes/deepspeed.rst +++ b/docs/source/main_classes/deepspeed.rst @@ -1061,7 +1061,8 @@ optimizers, with the exception of using the combination of HuggingFace scheduler | DS Optimizer | No | Yes | +--------------+--------------+--------------+ -If ``offload_optimizer`` is enabled you must use both DeepSpeed scheduler and DeepSpeed optimizer. +It is possible to use a non-DeepSpeed optimizer when ``offload_optimizer`` is enabled, as long as it has both CPU and +GPU implementations (except LAMB). diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index 7cf9fb07f0..359a2a0d52 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -315,9 +315,11 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): # # Unless Offload is enabled in which case it's: # 1. DS scheduler + DS optimizer: Yes - # 2. HF scheduler + HF optimizer: No - # 3. DS scheduler + HF optimizer: No + # 2. HF scheduler + HF optimizer: Mostly* + # 3. DS scheduler + HF optimizer: Mostly* # 4. HF scheduler + DS optimizer: No + # + # Mostly*: All non-DeepSpeed optimizers that have both CPU and GPU implementations should work (except LAMB) optimizer = None if "optimizer" in config: @@ -328,7 +330,9 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): ) else: if hf_deepspeed_config.is_offload(): - raise ValueError("ZeRO Offload can only work with DeepSpeed optimizers") + logger.info( + "Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB)" + ) # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. # But trainer uses AdamW by default. 
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 6c5fe60c47..9ae4401afe 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -325,20 +325,16 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): @parameterized.expand(stages) def test_hf_optimizer_with_offload(self, stage): - # must not allow non-DS optimizer when using ZERO-offload + # non-DS optimizers can be used with ZeRO-offload, as long as they have both CPU and GPU implementations (except LAMB) ds_config_dict = self.get_config_dict(stage) del ds_config_dict["optimizer"] # force default HF Trainer optimizer # force cpu offload ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu" with mockenv_context(**self.dist_env_1_gpu): trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_dict) - with self.assertRaises(Exception) as context: + with CaptureLogger(deepspeed_logger) as cl: trainer.train() - self.assertIn( - "ZeRO Offload can only work with DeepSpeed optimizers", - str(context.exception), - f"got exception: {context.exception}", - ) + self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none") @parameterized.expand(stages) def test_fake_notebook_no_launcher(self, stage):