diff --git a/README.md b/README.md index fde35d23ea..b348fde28c 100644 --- a/README.md +++ b/README.md @@ -984,7 +984,10 @@ The optimizer accepts the following arguments: - `warmup` : portion of `t_total` for the warmup, `-1` means no warmup. Default : `-1` - `t_total` : total number of training steps for the learning rate schedule, `-1` means constant learning rate. Default : `-1` -- `schedule` : schedule to use for the warmup (see above). Default : `'warmup_linear'` +- `schedule` : schedule to use for the warmup (see above). + Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below). + If `None` or `'none'`, learning rate is always kept constant. + Default : `'warmup_linear'` - `b1` : Adams b1. Default : `0.9` - `b2` : Adams b2. Default : `0.999` - `e` : Adams epsilon. Default : `1e-6` @@ -998,6 +1001,32 @@ The differences with `BertAdam` is that `OpenAIGPTAdam` compensate for bias as i `OpenAIGPTAdam` accepts the same arguments as `BertAdam`. +#### Learning Rate Schedules +The `.optimization` module also provides additional schedules in the form of schedule objects that inherit from `_LRSchedule`. +All `_LRSchedule` subclasses accept `warmup` and `t_total` arguments at construction. +When an `_LRSchedule` object is passed into `BertAdam` or `OpenAIAdam`, +the `warmup` and `t_total` arguments on the optimizer are ignored and the ones in the `_LRSchedule` object are used. +An overview of the implemented schedules: +- `ConstantLR`: always returns learning rate 1. +- `WarmupConstantSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Keeps learning rate equal to 1. after warmup. + ![](docs/imgs/warmup_constant_schedule.png) +- `WarmupLinearSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps. 
+ ![](docs/imgs/warmup_linear_schedule.png) +- `WarmupCosineSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve. + If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. + ![](docs/imgs/warmup_cosine_schedule.png) +- `WarmupCosineWithHardRestartsSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + If `cycles` (default=1.) is different from default, the learning rate follows a cosine decay that is repeated `cycles` times (with hard restarts). + ![](docs/imgs/warmup_cosine_hard_restarts_schedule.png) +- `WarmupCosineWithWarmupRestartsSchedule`: All training progress is divided into `cycles` (default=1.) parts of equal length. + Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1., + followed by a learning rate decreasing from 1. to 0. following a cosine curve. 
+ Note that the total fraction of training steps spent in warmup, summed over all cycles, is equal to `warmup` * `cycles`. + ![](docs/imgs/warmup_cosine_warm_restarts_schedule.png) + ## Examples | Sub-section | Description | diff --git a/docs/imgs/warmup_constant_schedule.png b/docs/imgs/warmup_constant_schedule.png new file mode 100644 index 0000000000..e2448e9f2c Binary files /dev/null and b/docs/imgs/warmup_constant_schedule.png differ diff --git a/docs/imgs/warmup_cosine_hard_restarts_schedule.png b/docs/imgs/warmup_cosine_hard_restarts_schedule.png new file mode 100644 index 0000000000..be73605b9c Binary files /dev/null and b/docs/imgs/warmup_cosine_hard_restarts_schedule.png differ diff --git a/docs/imgs/warmup_cosine_schedule.png b/docs/imgs/warmup_cosine_schedule.png new file mode 100644 index 0000000000..6d27926ab1 Binary files /dev/null and b/docs/imgs/warmup_cosine_schedule.png differ diff --git a/docs/imgs/warmup_cosine_warm_restarts_schedule.png b/docs/imgs/warmup_cosine_warm_restarts_schedule.png new file mode 100644 index 0000000000..71b39bffd3 Binary files /dev/null and b/docs/imgs/warmup_cosine_warm_restarts_schedule.png differ diff --git a/docs/imgs/warmup_linear_schedule.png b/docs/imgs/warmup_linear_schedule.png new file mode 100644 index 0000000000..4e1af31025 Binary files /dev/null and b/docs/imgs/warmup_linear_schedule.png differ diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 7e88b1b61c..03856956ac 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -85,7 +85,9 @@ class ConstantLR(_LRSchedule): class WarmupCosineSchedule(_LRSchedule): """ - Cosine learning rate schedule with linear warmup. Cosine after warmup is without restarts. + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve. 
+ If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup. """ warn_t_total = True def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): @@ -108,7 +110,9 @@ class WarmupCosineSchedule(_LRSchedule): class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule): """ - Cosine learning rate schedule with linear warmup and hard restarts (if cycles > 1). + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying + learning rate (with hard restarts). """ def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) @@ -125,9 +129,9 @@ class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule): class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule): """ - Cosine learning rate schedule with linear warmups and linear warmup restarts. - The same warmup rate is used for warmup restarts as for initial warmup. - The total effective fraction of warmup steps over all cycles is warmup * cycles! + All training progress is divided in `cycles` (default=1.) parts of equal length. + Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1., + followed by a learning rate decreasing from 1. to 0. following a cosine curve. """ def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): assert(warmup * cycles < 1.) @@ -146,7 +150,8 @@ class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedul class WarmupConstantSchedule(_LRSchedule): """ - Applies linear warmup. After warmup always returns 1.. + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Keeps learning rate equal to 1. after warmup. 
""" def get_lr_(self, progress): if progress < self.warmup: @@ -156,7 +161,8 @@ class WarmupConstantSchedule(_LRSchedule): class WarmupLinearSchedule(_LRSchedule): """ - Linear warmup. Linear decay after warmup. + Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps. + Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps. """ warn_t_total = True def get_lr_(self, progress): @@ -182,8 +188,9 @@ class BertAdam(Optimizer): t_total: total number of training steps for the learning rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1 schedule: schedule to use for the warmup (see above). - Can be 'warmup_linear', 'warmup_constant', 'warmup_cosine', or a LRSchedule object. - Default: 'warmup_linear' + Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below). + If `None` or `'none'`, learning rate is always kept constant. + Default : `'warmup_linear'` b1: Adams b1. Default: 0.9 b2: Adams b2. Default: 0.999 e: Adams epsilon. Default: 1e-6 @@ -208,8 +215,8 @@ class BertAdam(Optimizer): schedule = schedule_type(warmup=warmup, t_total=t_total) else: if warmup != -1 or t_total != -1: - logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided. " - "Please specify custom warmup and t_total in LRSchedule object.") + logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. 
" + "Please specify custom warmup and t_total in _LRSchedule object.") defaults = dict(lr=lr, schedule=schedule, b1=b1, b2=b2, e=e, weight_decay=weight_decay, max_grad_norm=max_grad_norm) diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py index 0cf0494e20..bff4ebe61f 100644 --- a/pytorch_pretrained_bert/optimization_openai.py +++ b/pytorch_pretrained_bert/optimization_openai.py @@ -48,8 +48,8 @@ class OpenAIAdam(Optimizer): schedule = schedule_type(warmup=warmup, t_total=t_total) else: if warmup != -1 or t_total != -1: - logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided. " - "Please specify custom warmup and t_total in LRSchedule object.") + logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. " + "Please specify custom warmup and t_total in _LRSchedule object.") defaults = dict(lr=lr, schedule=schedule, b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2, max_grad_norm=max_grad_norm) diff --git a/tests/optimization_test.py b/tests/optimization_test.py index f52aeb506b..bc12ff8a90 100644 --- a/tests/optimization_test.py +++ b/tests/optimization_test.py @@ -22,7 +22,8 @@ import torch from pytorch_pretrained_bert import BertAdam from pytorch_pretrained_bert import OpenAIAdam -from pytorch_pretrained_bert.optimization import ConstantLR, WarmupLinearSchedule, WarmupCosineWithWarmupRestartsSchedule +from pytorch_pretrained_bert.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \ + WarmupCosineWithWarmupRestartsSchedule, WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule import numpy as np @@ -86,6 +87,18 @@ class WarmupCosineWithRestartsTest(unittest.TestCase): self.assertTrue(np.allclose(expected_zeros, 0)) +class TestSchedulePlot(unittest.TestCase): + def test_plot_schedule(self): + import matplotlib as mpl + from matplotlib import pyplot as plt + m = 
WarmupCosineWithWarmupRestartsSchedule(warmup=.1, t_total=1000., cycles=3.) + x = np.arange(0, 1000) + y = [m.get_lr(xe) for xe in x] + y = np.asarray(y) + plt.figure(figsize=(9, 2)) + plt.plot(y) + #plt.grid(True) + plt.show() if __name__ == "__main__":