Support constant lr with cooldown (#35453)

* Add support for constant learning rate with cooldown

* Add support for constant learning rate with cooldown

* Add support for constant learning rate with cooldown

* Add support for constant learning rate with cooldown

* Add support for constant learning rate with cooldown

* Add support for constant learning rate with cooldown

* Add support for constant learning rate with cooldown

* Add more warmup and cooldown methods to 'get_wsc_schedule'

* Add more warmup and cooldown methods to 'get_wsc_schedule'

* Add more warmup and cooldown methods to 'get_wsc_schedule'

* Add more warmup and cooldown methods to 'get_wsc_schedule'

* Add more warmup and decay methods to 'get_wsd_schedule'

* support num_training_steps and num_stable_steps for get_wsd_schedule

* support num_training_steps and num_stable_steps for get_wsd_schedule

* get wsd scheduler before the `num_training_steps` decision

* fix code_quality

* Update stable branch logic

* fix code_quality

* Move stable stage decide to `get_wsd_schedule`

* Update docstring of `get_wsd_schedule`

* Update `num_train_steps` to optional

* Update `num_train_steps` to optional

* Update docstring of `get_wsd_schedule`

* Update src/transformers/optimization.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
This commit is contained in:
Jingze Shi
2025-02-10 20:21:55 +08:00
committed by GitHub
parent 9a6be63fdb
commit 48a309d0d2
2 changed files with 86 additions and 17 deletions

View File

@@ -153,8 +153,8 @@ class ScheduleInitTest(unittest.TestCase):
[0.0, 5.0, 10.0, 8.165, 7.071, 6.325, 5.774, 5.345, 5.0, 4.714],
),
get_wsd_schedule: (
{"num_warmup_steps": 2, "num_stable_steps": 2, "num_decay_steps": 3, "min_lr_ratio": 0.1},
[0.0, 5.0, 10.0, 10.0, 10.0, 7.75, 3.25, 1.0, 1.0, 1.0],
{**common_kwargs, "num_decay_steps": 2, "min_lr_ratio": 0.0},
[0.0, 5.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 5.0],
),
}
@@ -183,14 +183,34 @@ class ScheduleInitTest(unittest.TestCase):
"name": "warmup_stable_decay",
"optimizer": self.optimizer,
"num_warmup_steps": 2,
"scheduler_specific_kwargs": {"num_stable_steps": 1, "num_decay_steps": 3},
"num_training_steps": 10,
"scheduler_specific_kwargs": {
"num_decay_steps": 2,
"warmup_type": "linear",
"decay_type": "linear",
},
},
{
"name": "warmup_stable_decay",
"optimizer": self.optimizer,
"num_warmup_steps": 2,
"num_training_steps": 10,
"scheduler_specific_kwargs": {"num_stable_steps": 1, "num_decay_steps": 3},
"scheduler_specific_kwargs": {
"num_decay_steps": 2,
"warmup_type": "cosine",
"decay_type": "cosine",
},
},
{
"name": "warmup_stable_decay",
"optimizer": self.optimizer,
"num_warmup_steps": 2,
"num_training_steps": 10,
"scheduler_specific_kwargs": {
"num_decay_steps": 2,
"warmup_type": "1-sqrt",
"decay_type": "1-sqrt",
},
},
{"name": "cosine", "optimizer": self.optimizer, "num_warmup_steps": 2, "num_training_steps": 10},
]