Add cosine_with_min_lr_schedule_with_warmup_lr_rate scheduler in Trainer (#31870)

* add cosine_with_min_lr_schedule_with_warmup_lr_rate scheduler in trainer

* Update src/transformers/optimization.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update optimization.py

fix the error of the unclosed "("

* Update optimization.py

remove whitespace in line 402 in order to pass the quality test

* Update src/transformers/optimization.py

* Update src/transformers/optimization.py

* Apply style fixes

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
This commit is contained in:
richardodliu
2025-07-16 18:01:08 +08:00
committed by GitHub
parent 0cf08e90dd
commit e048d48bd0
3 changed files with 111 additions and 0 deletions

View File

@@ -384,6 +384,87 @@ def get_cosine_with_min_lr_schedule_with_warmup(
return LambdaLR(optimizer, lr_lambda, last_epoch) return LambdaLR(optimizer, lr_lambda, last_epoch)
def _get_cosine_with_min_lr_schedule_with_warmup_lr_rate_lambda(
current_step: int,
*,
num_warmup_steps: int,
num_training_steps: int,
num_cycles: float,
min_lr_rate: float = 0.0,
warmup_lr_rate: Optional[float] = None,
):
current_step = float(current_step)
num_warmup_steps = float(num_warmup_steps)
num_training_steps = float(num_training_steps)
if current_step < num_warmup_steps:
if warmup_lr_rate is None:
return (current_step + 1.0) / max(1.0, num_warmup_steps)
else:
warmup_lr_rate = float(warmup_lr_rate)
return warmup_lr_rate + (1.0 - warmup_lr_rate) * (current_step) / (max(1, num_warmup_steps - 1))
progress = (current_step - num_warmup_steps + 1.0) / (max(1.0, num_training_steps - num_warmup_steps))
factor = 0.5 * (1.0 + math.cos(math.pi * num_cycles * 2.0 * progress))
factor = factor * (1 - min_lr_rate) + min_lr_rate
return max(0, factor)
def get_cosine_with_min_lr_schedule_with_warmup_lr_rate(
optimizer: Optimizer,
num_warmup_steps: int,
num_training_steps: int,
num_cycles: float = 0.5,
last_epoch: int = -1,
min_lr: Optional[float] = None,
min_lr_rate: Optional[float] = None,
warmup_lr_rate: Optional[float] = None,
):
"""
Create a schedule with a learning rate that decreases following the values of the cosine function between the
initial lr set in the optimizer to min_lr, after a warmup period during which it increases linearly between 0 and the
initial lr set in the optimizer.
Args:
optimizer ([`~torch.optim.Optimizer`]):
The optimizer for which to schedule the learning rate.
num_warmup_steps (`int`):
The number of steps for the warmup phase.
num_training_steps (`int`):
The total number of training steps.
num_cycles (`float`, *optional*, defaults to 0.5):
The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
following a half-cosine).
last_epoch (`int`, *optional*, defaults to -1):
The index of the last epoch when resuming training.
min_lr (`float`, *optional*):
The minimum learning rate to reach after the cosine schedule.
min_lr_rate (`float`, *optional*):
The minimum learning rate as a ratio of the initial learning rate. If set, `min_lr` should not be set.
warmup_lr_rate (`float`, *optional*):
The minimum learning rate as a ratio of the start learning rate. If not set, `warmup_lr_rate` will be treated as float(1/num_warmup_steps).
Return:
`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
"""
if min_lr is not None and min_lr_rate is not None:
raise ValueError("Only one of min_lr or min_lr_rate should be set")
elif min_lr is not None:
min_lr_rate = min_lr / optimizer.defaults["lr"]
elif min_lr_rate is None:
raise ValueError("One of min_lr or min_lr_rate should be set through the `lr_scheduler_kwargs`")
lr_lambda = partial(
_get_cosine_with_min_lr_schedule_with_warmup_lr_rate_lambda,
num_warmup_steps=num_warmup_steps,
num_training_steps=num_training_steps,
num_cycles=num_cycles,
min_lr_rate=min_lr_rate,
warmup_lr_rate=warmup_lr_rate,
)
return LambdaLR(optimizer, lr_lambda, last_epoch)
def _get_wsd_scheduler_lambda( def _get_wsd_scheduler_lambda(
current_step: int, current_step: int,
*, *,
@@ -505,6 +586,7 @@ TYPE_TO_SCHEDULER_FUNCTION = {
SchedulerType.INVERSE_SQRT: get_inverse_sqrt_schedule, SchedulerType.INVERSE_SQRT: get_inverse_sqrt_schedule,
SchedulerType.REDUCE_ON_PLATEAU: get_reduce_on_plateau_schedule, SchedulerType.REDUCE_ON_PLATEAU: get_reduce_on_plateau_schedule,
SchedulerType.COSINE_WITH_MIN_LR: get_cosine_with_min_lr_schedule_with_warmup, SchedulerType.COSINE_WITH_MIN_LR: get_cosine_with_min_lr_schedule_with_warmup,
SchedulerType.COSINE_WARMUP_WITH_MIN_LR: get_cosine_with_min_lr_schedule_with_warmup_lr_rate,
SchedulerType.WARMUP_STABLE_DECAY: get_wsd_schedule, SchedulerType.WARMUP_STABLE_DECAY: get_wsd_schedule,
} }

View File

@@ -444,6 +444,7 @@ class SchedulerType(ExplicitEnum):
INVERSE_SQRT = "inverse_sqrt" INVERSE_SQRT = "inverse_sqrt"
REDUCE_ON_PLATEAU = "reduce_lr_on_plateau" REDUCE_ON_PLATEAU = "reduce_lr_on_plateau"
COSINE_WITH_MIN_LR = "cosine_with_min_lr" COSINE_WITH_MIN_LR = "cosine_with_min_lr"
COSINE_WARMUP_WITH_MIN_LR = "cosine_warmup_with_min_lr"
WARMUP_STABLE_DECAY = "warmup_stable_decay" WARMUP_STABLE_DECAY = "warmup_stable_decay"

View File

@@ -1143,6 +1143,34 @@ class TrainerIntegrationPrerunTest(TestCasePlus, TrainerIntegrationCommon):
trainer.lr_scheduler.step() trainer.lr_scheduler.step()
self.assertEqual(trainer.lr_scheduler.get_last_lr()[0], 1e-5) self.assertEqual(trainer.lr_scheduler.get_last_lr()[0], 1e-5)
def test_cosine_with_min_lr_schedule_with_warmup_lr_rate(self):
train_dataset = RegressionDataset()
model = RegressionModel()
num_steps, num_warmup_steps = 10, 2
extra_kwargs = {"min_lr": 1e-5} # Non-default arguments
args = TrainingArguments(
"./regression",
lr_scheduler_type="cosine_warmup_with_min_lr",
lr_scheduler_kwargs=extra_kwargs,
learning_rate=0.2,
warmup_steps=num_warmup_steps,
report_to="none",
)
trainer = Trainer(model, args, train_dataset=train_dataset)
trainer.create_optimizer_and_scheduler(num_training_steps=num_steps)
# Checking that the scheduler was created
self.assertIsNotNone(trainer.lr_scheduler)
# Check the last learning rate
step_lrs = []
for _ in range(num_steps):
step_lrs.append(trainer.optimizer.param_groups[0]["lr"])
trainer.lr_scheduler.step()
self.assertEqual(step_lrs[0], 0.1)
self.assertEqual(step_lrs[1], 0.2)
self.assertEqual(step_lrs[-1], 1e-05)
def test_reduce_lr_on_plateau_args(self): def test_reduce_lr_on_plateau_args(self):
# test passed arguments for a custom ReduceLROnPlateau scheduler # test passed arguments for a custom ReduceLROnPlateau scheduler
train_dataset = RegressionDataset(length=64) train_dataset = RegressionDataset(length=64)