diff --git a/docs/source/main_classes/deepspeed.rst b/docs/source/main_classes/deepspeed.rst
index 05a86b0801..2769ca044a 100644
--- a/docs/source/main_classes/deepspeed.rst
+++ b/docs/source/main_classes/deepspeed.rst
@@ -1061,7 +1061,8 @@ optimizers, with the exception of using the combination of HuggingFace scheduler
 | DS Optimizer | No           | Yes          |
 +--------------+--------------+--------------+
 
-If ``offload_optimizer`` is enabled you must use both DeepSpeed scheduler and DeepSpeed optimizer.
+It is possible to use a non-DeepSpeed optimizer when ``offload_optimizer`` is enabled, as long as it has both a CPU
+and a GPU implementation (with the exception of LAMB).
 
 
 
diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py
index 7cf9fb07f0..359a2a0d52 100644
--- a/src/transformers/deepspeed.py
+++ b/src/transformers/deepspeed.py
@@ -315,9 +315,11 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None):
     #
     # Unless Offload is enabled in which case it's:
     # 1. DS scheduler + DS optimizer: Yes
-    # 2. HF scheduler + HF optimizer: No
-    # 3. DS scheduler + HF optimizer: No
+    # 2. HF scheduler + HF optimizer: Mostly*
+    # 3. DS scheduler + HF optimizer: Mostly*
     # 4. HF scheduler + DS optimizer: No
+    #
+    # Mostly*: All non-DeepSpeed optimizers that have both a CPU and a GPU implementation should work (except LAMB)
 
     optimizer = None
     if "optimizer" in config:
@@ -328,7 +330,9 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None):
             )
     else:
         if hf_deepspeed_config.is_offload():
-            raise ValueError("ZeRO Offload can only work with DeepSpeed optimizers")
+            logger.info(
+                "Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB)"
+            )
 
         # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
         # But trainer uses AdamW by default.
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index 6c5fe60c47..9ae4401afe 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -325,20 +325,16 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
 
     @parameterized.expand(stages)
     def test_hf_optimizer_with_offload(self, stage):
-        # must not allow non-DS optimizer when using ZERO-offload
+        # non-DS optimizers may be used with ZeRO-Offload if they have both CPU and GPU implementations (except LAMB)
         ds_config_dict = self.get_config_dict(stage)
         del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
         # force cpu offload
         ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
         with mockenv_context(**self.dist_env_1_gpu):
             trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_dict)
-            with self.assertRaises(Exception) as context:
+            with CaptureLogger(deepspeed_logger) as cl:
                 trainer.train()
-            self.assertIn(
-                "ZeRO Offload can only work with DeepSpeed optimizers",
-                str(context.exception),
-                f"got exception: {context.exception}",
-            )
+            self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
 
     @parameterized.expand(stages)
     def test_fake_notebook_no_launcher(self, stage):