Support constant lr with cooldown (#35453)

* Add support for constant learning rate with cooldown * Add support for constant learning rate with cooldown * Add support for constant learning rate with cooldown * Add support for constant learning rate with cooldown * Add support for constant learning rate with cooldown * Add support for constant learning rate with cooldown * Add support for constant learning rate with cooldown * Add more warmup and cooldown methods to 'get_wsc_schedule' * Add more warmup and cooldown methods to 'get_wsc_schedule' * Add more warmup and cooldown methods to 'get_wsc_schedule' * Add more warmup and cooldown methods to 'get_wsc_schedule' * Add more warmup and decay methods to 'get_wsd_schedule' * support num_training_steps and num_stable_steps for get_wsd_schedule * support num_training_steps and num_stable_steps for get_wsd_schedule * get wsd scheduler before the `num_training_steps` decision * fix code_quality * Update stable branch logic * fix code_quality * Move stable stage decide to `get_wsd_schedule` * Update docstring of `get_wsd_schedule` * Update `num_train_steps` to optional * Update `num_train_steps` to optional * Update docstring of `get_wsd_schedule` * Update src/transformers/optimization.py Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --------- Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-02-10 20:21:55 +08:00
parent 9a6be63fdb
commit 48a309d0d2
2 changed files with 86 additions and 17 deletions
--- a/tests/optimization/test_optimization.py
+++ b/tests/optimization/test_optimization.py
@@ -153,8 +153,8 @@ class ScheduleInitTest(unittest.TestCase):
                [0.0, 5.0, 10.0, 8.165, 7.071, 6.325, 5.774, 5.345, 5.0, 4.714],
            ),
            get_wsd_schedule: (
-                {"num_warmup_steps": 2, "num_stable_steps": 2, "num_decay_steps": 3, "min_lr_ratio": 0.1},
-                [0.0, 5.0, 10.0, 10.0, 10.0, 7.75, 3.25, 1.0, 1.0, 1.0],
+                {**common_kwargs, "num_decay_steps": 2, "min_lr_ratio": 0.0},
+                [0.0, 5.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 5.0],
            ),
        }

@@ -183,14 +183,34 @@ class ScheduleInitTest(unittest.TestCase):
                "name": "warmup_stable_decay",
                "optimizer": self.optimizer,
                "num_warmup_steps": 2,
-                "scheduler_specific_kwargs": {"num_stable_steps": 1, "num_decay_steps": 3},
+                "num_training_steps": 10,
+                "scheduler_specific_kwargs": {
+                    "num_decay_steps": 2,
+                    "warmup_type": "linear",
+                    "decay_type": "linear",
+                },
            },
            {
                "name": "warmup_stable_decay",
                "optimizer": self.optimizer,
                "num_warmup_steps": 2,
                "num_training_steps": 10,
-                "scheduler_specific_kwargs": {"num_stable_steps": 1, "num_decay_steps": 3},
+                "scheduler_specific_kwargs": {
+                    "num_decay_steps": 2,
+                    "warmup_type": "cosine",
+                    "decay_type": "cosine",
+                },
+            },
+            {
+                "name": "warmup_stable_decay",
+                "optimizer": self.optimizer,
+                "num_warmup_steps": 2,
+                "num_training_steps": 10,
+                "scheduler_specific_kwargs": {
+                    "num_decay_steps": 2,
+                    "warmup_type": "1-sqrt",
+                    "decay_type": "1-sqrt",
+                },
            },
            {"name": "cosine", "optimizer": self.optimizer, "num_warmup_steps": 2, "num_training_steps": 10},
        ]