From e048d48bd0f87a0331020f55966e715faf0671d4 Mon Sep 17 00:00:00 2001 From: richardodliu Date: Wed, 16 Jul 2025 18:01:08 +0800 Subject: [PATCH] =?UTF-8?q?Add=C2=A0cosine=5Fwith=5Fmin=5Flr=5Fschedule=5F?= =?UTF-8?q?with=5Fwarmup=5Flr=5Frate=C2=A0scheduler=20in=20Trainer=20(#318?= =?UTF-8?q?70)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add cosine_with_min_lr_schedule_with_warmup_lr_rate scheduler in trainer * Update src/transformers/optimization.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update optimization.py fix the error of the unclosed "(" * Update optimization.py remove whitespace in line 402 in order to pass the quality test * Update src/transformers/optimization.py * Update src/transformers/optimization.py * Apply style fixes --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: github-actions[bot] Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/optimization.py | 82 +++++++++++++++++++++++++++++++ src/transformers/trainer_utils.py | 1 + tests/trainer/test_trainer.py | 28 +++++++++++ 3 files changed, 111 insertions(+) diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py index 29d3b6f7b9..1c71487dbb 100644 --- a/src/transformers/optimization.py +++ b/src/transformers/optimization.py @@ -384,6 +384,87 @@ def get_cosine_with_min_lr_schedule_with_warmup( return LambdaLR(optimizer, lr_lambda, last_epoch) +def _get_cosine_with_min_lr_schedule_with_warmup_lr_rate_lambda( + current_step: int, + *, + num_warmup_steps: int, + num_training_steps: int, + num_cycles: float, + min_lr_rate: float = 0.0, + warmup_lr_rate: Optional[float] = None, +): + current_step = float(current_step) + num_warmup_steps = float(num_warmup_steps) + num_training_steps = float(num_training_steps) + + if current_step < 
num_warmup_steps: + if warmup_lr_rate is None: + return (current_step + 1.0) / max(1.0, num_warmup_steps) + else: + warmup_lr_rate = float(warmup_lr_rate) + return warmup_lr_rate + (1.0 - warmup_lr_rate) * (current_step) / (max(1, num_warmup_steps - 1)) + progress = (current_step - num_warmup_steps + 1.0) / (max(1.0, num_training_steps - num_warmup_steps)) + factor = 0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress)) + factor = factor * (1 - min_lr_rate) + min_lr_rate + return max(0, factor) + + +def get_cosine_with_min_lr_schedule_with_warmup_lr_rate( + optimizer: Optimizer, + num_warmup_steps: int, + num_training_steps: int, + num_cycles: float = 0.5, + last_epoch: int = -1, + min_lr: Optional[float] = None, + min_lr_rate: Optional[float] = None, + warmup_lr_rate: Optional[float] = None, +): + """ + Create a schedule with a learning rate that decreases following the values of the cosine function between the + initial lr set in the optimizer to min_lr, after a warmup period during which it increases linearly between 0 and the + initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + num_cycles (`float`, *optional*, defaults to 0.5): + The number of waves in the cosine schedule (the default is to just decrease from the max value to the minimum learning rate + following a half-cosine). + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + min_lr (`float`, *optional*): + The minimum learning rate to reach after the cosine schedule. + min_lr_rate (`float`, *optional*): + The minimum learning rate as a ratio of the initial learning rate. If set, `min_lr` should not be set. + warmup_lr_rate (`float`, *optional*): + The learning rate at the first warmup step, as a ratio of the initial learning rate. 
If not set, `warmup_lr_rate` will be treated as float(1/num_warmup_steps). + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + if min_lr is not None and min_lr_rate is not None: + raise ValueError("Only one of min_lr or min_lr_rate should be set") + elif min_lr is not None: + min_lr_rate = min_lr / optimizer.defaults["lr"] + elif min_lr_rate is None: + raise ValueError("One of min_lr or min_lr_rate should be set through the `lr_scheduler_kwargs`") + + lr_lambda = partial( + _get_cosine_with_min_lr_schedule_with_warmup_lr_rate_lambda, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + num_cycles=num_cycles, + min_lr_rate=min_lr_rate, + warmup_lr_rate=warmup_lr_rate, + ) + return LambdaLR(optimizer, lr_lambda, last_epoch) + + def _get_wsd_scheduler_lambda( current_step: int, *, @@ -505,6 +586,7 @@ TYPE_TO_SCHEDULER_FUNCTION = { SchedulerType.INVERSE_SQRT: get_inverse_sqrt_schedule, SchedulerType.REDUCE_ON_PLATEAU: get_reduce_on_plateau_schedule, SchedulerType.COSINE_WITH_MIN_LR: get_cosine_with_min_lr_schedule_with_warmup, + SchedulerType.COSINE_WARMUP_WITH_MIN_LR: get_cosine_with_min_lr_schedule_with_warmup_lr_rate, SchedulerType.WARMUP_STABLE_DECAY: get_wsd_schedule, } diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py index 0766b8c2ed..317e50fb68 100644 --- a/src/transformers/trainer_utils.py +++ b/src/transformers/trainer_utils.py @@ -444,6 +444,7 @@ class SchedulerType(ExplicitEnum): INVERSE_SQRT = "inverse_sqrt" REDUCE_ON_PLATEAU = "reduce_lr_on_plateau" COSINE_WITH_MIN_LR = "cosine_with_min_lr" + COSINE_WARMUP_WITH_MIN_LR = "cosine_warmup_with_min_lr" WARMUP_STABLE_DECAY = "warmup_stable_decay" diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index 1a7f512025..e91ff1e21d 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -1143,6 +1143,34 @@ class TrainerIntegrationPrerunTest(TestCasePlus, 
TrainerIntegrationCommon): trainer.lr_scheduler.step() self.assertEqual(trainer.lr_scheduler.get_last_lr()[0], 1e-5) + def test_cosine_with_min_lr_schedule_with_warmup_lr_rate(self): + train_dataset = RegressionDataset() + model = RegressionModel() + num_steps, num_warmup_steps = 10, 2 + extra_kwargs = {"min_lr": 1e-5} # Non-default arguments + args = TrainingArguments( + "./regression", + lr_scheduler_type="cosine_warmup_with_min_lr", + lr_scheduler_kwargs=extra_kwargs, + learning_rate=0.2, + warmup_steps=num_warmup_steps, + report_to="none", + ) + trainer = Trainer(model, args, train_dataset=train_dataset) + trainer.create_optimizer_and_scheduler(num_training_steps=num_steps) + + # Checking that the scheduler was created + self.assertIsNotNone(trainer.lr_scheduler) + + # Check the learning rate at the first two steps and at the last step + step_lrs = [] + for _ in range(num_steps): + step_lrs.append(trainer.optimizer.param_groups[0]["lr"]) + trainer.lr_scheduler.step() + self.assertEqual(step_lrs[0], 0.1) + self.assertEqual(step_lrs[1], 0.2) + self.assertEqual(step_lrs[-1], 1e-05) + def test_reduce_lr_on_plateau_args(self): # test passed arguments for a custom ReduceLROnPlateau scheduler train_dataset = RegressionDataset(length=64)