Add cosine_with_min_lr_schedule_with_warmup_lr_rate scheduler in Trainer (#31870)

* add cosine_with_min_lr_schedule_with_warmup_lr_rate scheduler in trainer * Update src/transformers/optimization.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update optimization.py fix the error of the unclosed "(" * Update optimization.py remove whitespace in line 402 in order to pass the quality test * Update src/transformers/optimization.py * Update src/transformers/optimization.py * Apply style fixes --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-07-16 18:01:08 +08:00
parent 0cf08e90dd
commit e048d48bd0
3 changed files with 111 additions and 0 deletions
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -384,6 +384,87 @@ def get_cosine_with_min_lr_schedule_with_warmup(
    return LambdaLR(optimizer, lr_lambda, last_epoch)
 def _get_cosine_with_min_lr_schedule_with_warmup_lr_rate_lambda(
    current_step: int,
    *,
    num_warmup_steps: int,
    num_training_steps: int,
    num_cycles: float,
    min_lr_rate: float = 0.0,
    warmup_lr_rate: Optional[float] = None,
 ):
    current_step = float(current_step)
    num_warmup_steps = float(num_warmup_steps)
    num_training_steps = float(num_training_steps)
    if current_step < num_warmup_steps:
        if warmup_lr_rate is None:
            return (current_step + 1.0) / max(1.0, num_warmup_steps)
        else:
            warmup_lr_rate = float(warmup_lr_rate)
            return warmup_lr_rate + (1.0 - warmup_lr_rate) * (current_step) / (max(1, num_warmup_steps - 1))
    progress = (current_step - num_warmup_steps + 1.0) / (max(1.0, num_training_steps - num_warmup_steps))
    factor = 0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress))
    factor = factor * (1 - min_lr_rate) + min_lr_rate
    return max(0, factor)
 def get_cosine_with_min_lr_schedule_with_warmup_lr_rate(
    optimizer: Optimizer,
    num_warmup_steps: int,
    num_training_steps: int,
    num_cycles: float = 0.5,
    last_epoch: int = -1,
    min_lr: Optional[float] = None,
    min_lr_rate: Optional[float] = None,
    warmup_lr_rate: Optional[float] = None,
 ):
    """
    Create a schedule with a learning rate that decreases following the values of the cosine function between the
    initial lr set in the optimizer to min_lr, after a warmup period during which it increases linearly between 0 and the
    initial lr set in the optimizer.
    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
        num_cycles (`float`, *optional*, defaults to 0.5):
            The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
            following a half-cosine).
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.
        min_lr (`float`, *optional*):
            The minimum learning rate to reach after the cosine schedule.
        min_lr_rate (`float`, *optional*):
            The minimum learning rate as a ratio of the initial learning rate. If set, `min_lr` should not be set.
        warmup_lr_rate (`float`, *optional*):
            The minimum learning rate as a ratio of the start learning rate. If not set, `warmup_lr_rate` will be treated as float(1/num_warmup_steps).
    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """
    if min_lr is not None and min_lr_rate is not None:
        raise ValueError("Only one of min_lr or min_lr_rate should be set")
    elif min_lr is not None:
        min_lr_rate = min_lr / optimizer.defaults["lr"]
    elif min_lr_rate is None:
        raise ValueError("One of min_lr or min_lr_rate should be set through the `lr_scheduler_kwargs`")
    lr_lambda = partial(
        _get_cosine_with_min_lr_schedule_with_warmup_lr_rate_lambda,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        num_cycles=num_cycles,
        min_lr_rate=min_lr_rate,
        warmup_lr_rate=warmup_lr_rate,
    )
    return LambdaLR(optimizer, lr_lambda, last_epoch)
 def _get_wsd_scheduler_lambda(
    current_step: int,
    *,
@@ -505,6 +586,7 @@ TYPE_TO_SCHEDULER_FUNCTION = {
    SchedulerType.INVERSE_SQRT: get_inverse_sqrt_schedule,
    SchedulerType.REDUCE_ON_PLATEAU: get_reduce_on_plateau_schedule,
    SchedulerType.COSINE_WITH_MIN_LR: get_cosine_with_min_lr_schedule_with_warmup,
    SchedulerType.COSINE_WARMUP_WITH_MIN_LR: get_cosine_with_min_lr_schedule_with_warmup_lr_rate,
    SchedulerType.WARMUP_STABLE_DECAY: get_wsd_schedule,
 }
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -444,6 +444,7 @@ class SchedulerType(ExplicitEnum):
    INVERSE_SQRT = "inverse_sqrt"
    REDUCE_ON_PLATEAU = "reduce_lr_on_plateau"
    COSINE_WITH_MIN_LR = "cosine_with_min_lr"
    COSINE_WARMUP_WITH_MIN_LR = "cosine_warmup_with_min_lr"
    WARMUP_STABLE_DECAY = "warmup_stable_decay"
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1143,6 +1143,34 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
                trainer.lr_scheduler.step()
            self.assertEqual(trainer.lr_scheduler.get_last_lr()[0], 1e-5)
    def test_cosine_with_min_lr_schedule_with_warmup_lr_rate(self):
        train_dataset = RegressionDataset()
        model = RegressionModel()
        num_steps, num_warmup_steps = 10, 2
        extra_kwargs = {"min_lr": 1e-5}  # Non-default arguments
        args = TrainingArguments(
            "./regression",
            lr_scheduler_type="cosine_warmup_with_min_lr",
            lr_scheduler_kwargs=extra_kwargs,
            learning_rate=0.2,
            warmup_steps=num_warmup_steps,
            report_to="none",
        )
        trainer = Trainer(model, args, train_dataset=train_dataset)
        trainer.create_optimizer_and_scheduler(num_training_steps=num_steps)
        # Checking that the scheduler was created
        self.assertIsNotNone(trainer.lr_scheduler)
        # Check the last learning rate
        step_lrs = []
        for _ in range(num_steps):
            step_lrs.append(trainer.optimizer.param_groups[0]["lr"])
            trainer.lr_scheduler.step()
        self.assertEqual(step_lrs[0], 0.1)
        self.assertEqual(step_lrs[1], 0.2)
        self.assertEqual(step_lrs[-1], 1e-05)
    def test_reduce_lr_on_plateau_args(self):
        # test passed arguments for a custom ReduceLROnPlateau scheduler
        train_dataset = RegressionDataset(length=64)