- updated docs for new LR API
- added some images for illustration - updated comments in optimization
This commit is contained in:
31
README.md
31
README.md
@@ -984,7 +984,10 @@ The optimizer accepts the following arguments:
|
|||||||
- `warmup` : portion of `t_total` for the warmup, `-1` means no warmup. Default : `-1`
|
- `warmup` : portion of `t_total` for the warmup, `-1` means no warmup. Default : `-1`
|
||||||
- `t_total` : total number of training steps for the learning
|
- `t_total` : total number of training steps for the learning
|
||||||
rate schedule, `-1` means constant learning rate. Default : `-1`
|
rate schedule, `-1` means constant learning rate. Default : `-1`
|
||||||
- `schedule` : schedule to use for the warmup (see above). Default : `'warmup_linear'`
|
- `schedule` : schedule to use for the warmup (see above).
|
||||||
|
Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
|
||||||
|
If `None` or `'none'`, learning rate is always kept constant.
|
||||||
|
Default : `'warmup_linear'`
|
||||||
- `b1` : Adams b1. Default : `0.9`
|
- `b1` : Adams b1. Default : `0.9`
|
||||||
- `b2` : Adams b2. Default : `0.999`
|
- `b2` : Adams b2. Default : `0.999`
|
||||||
- `e` : Adams epsilon. Default : `1e-6`
|
- `e` : Adams epsilon. Default : `1e-6`
|
||||||
@@ -998,6 +1001,32 @@ The differences with `BertAdam` is that `OpenAIGPTAdam` compensate for bias as i
|
|||||||
|
|
||||||
`OpenAIGPTAdam` accepts the same arguments as `BertAdam`.
|
`OpenAIGPTAdam` accepts the same arguments as `BertAdam`.
|
||||||
|
|
||||||
|
#### Learning Rate Schedules
|
||||||
|
The `.optimization` module also provides additional schedules in the form of schedule objects that inherit from `_LRSchedule`.
|
||||||
|
All `_LRSchedule` subclasses accept `warmup` and `t_total` arguments at construction.
|
||||||
|
When an `_LRSchedule` object is passed into `BertAdam` or `OpenAIAdam`,
|
||||||
|
the `warmup` and `t_total` arguments on the optimizer are ignored and the ones in the `_LRSchedule` object are used.
|
||||||
|
An overview of the implemented schedules:
|
||||||
|
- `ConstantLR`: always returns learning rate 1.
|
||||||
|
- `WarmupConstantSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
|
||||||
|
Keeps learning rate equal to 1. after warmup.
|
||||||
|

|
||||||
|
- `WarmupLinearSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
|
||||||
|
Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
|
||||||
|

|
||||||
|
- `WarmupCosineSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
|
||||||
|
Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
|
||||||
|
If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
|
||||||
|

|
||||||
|
- `WarmupCosineWithHardRestartsSchedule`: Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
|
||||||
|
If `cycles` (default=1.) is different from default, the learning rate follows `cycles` successive cosine decays (with hard restarts).
|
||||||
|

|
||||||
|
- `WarmupCosineWithWarmupRestartsSchedule`: All training progress is divided into `cycles` (default=1.) parts of equal length.
|
||||||
|
Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
|
||||||
|
followed by a learning rate decreasing from 1. to 0. following a cosine curve.
|
||||||
|
Note that the total fraction of training steps spent in warmup, summed over all cycles, is equal to `warmup` * `cycles`.
|
||||||
|

|
||||||
|
|
||||||
## Examples
|
## Examples
|
||||||
|
|
||||||
| Sub-section | Description |
|
| Sub-section | Description |
|
||||||
|
|||||||
BIN
docs/imgs/warmup_constant_schedule.png
Normal file
BIN
docs/imgs/warmup_constant_schedule.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 9.7 KiB |
BIN
docs/imgs/warmup_cosine_hard_restarts_schedule.png
Normal file
BIN
docs/imgs/warmup_cosine_hard_restarts_schedule.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 22 KiB |
BIN
docs/imgs/warmup_cosine_schedule.png
Normal file
BIN
docs/imgs/warmup_cosine_schedule.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 17 KiB |
BIN
docs/imgs/warmup_cosine_warm_restarts_schedule.png
Normal file
BIN
docs/imgs/warmup_cosine_warm_restarts_schedule.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 22 KiB |
BIN
docs/imgs/warmup_linear_schedule.png
Normal file
BIN
docs/imgs/warmup_linear_schedule.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 16 KiB |
@@ -85,7 +85,9 @@ class ConstantLR(_LRSchedule):
|
|||||||
|
|
||||||
class WarmupCosineSchedule(_LRSchedule):
|
class WarmupCosineSchedule(_LRSchedule):
|
||||||
"""
|
"""
|
||||||
Cosine learning rate schedule with linear warmup. Cosine after warmup is without restarts.
|
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
|
||||||
|
Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
|
||||||
|
If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
|
||||||
"""
|
"""
|
||||||
warn_t_total = True
|
warn_t_total = True
|
||||||
def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
|
def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
|
||||||
@@ -108,7 +110,9 @@ class WarmupCosineSchedule(_LRSchedule):
|
|||||||
|
|
||||||
class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
|
class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
|
||||||
"""
|
"""
|
||||||
Cosine learning rate schedule with linear warmup and hard restarts (if cycles > 1).
|
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
|
||||||
|
If `cycles` (default=1.) is different from default, the learning rate follows `cycles` successive cosine decays
|
||||||
|
(with hard restarts).
|
||||||
"""
|
"""
|
||||||
def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
|
def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
|
||||||
super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
|
super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
|
||||||
@@ -125,9 +129,9 @@ class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
|
|||||||
|
|
||||||
class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
|
class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
|
||||||
"""
|
"""
|
||||||
Cosine learning rate schedule with linear warmups and linear warmup restarts.
|
All training progress is divided into `cycles` (default=1.) parts of equal length.
|
||||||
The same warmup rate is used for warmup restarts as for initial warmup.
|
Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
|
||||||
The total effective fraction of warmup steps over all cycles is warmup * cycles!
|
followed by a learning rate decreasing from 1. to 0. following a cosine curve.
|
||||||
"""
|
"""
|
||||||
def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
|
def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
|
||||||
assert(warmup * cycles < 1.)
|
assert(warmup * cycles < 1.)
|
||||||
@@ -146,7 +150,8 @@ class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedul
|
|||||||
|
|
||||||
class WarmupConstantSchedule(_LRSchedule):
|
class WarmupConstantSchedule(_LRSchedule):
|
||||||
"""
|
"""
|
||||||
Applies linear warmup. After warmup always returns 1..
|
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
|
||||||
|
Keeps learning rate equal to 1. after warmup.
|
||||||
"""
|
"""
|
||||||
def get_lr_(self, progress):
|
def get_lr_(self, progress):
|
||||||
if progress < self.warmup:
|
if progress < self.warmup:
|
||||||
@@ -156,7 +161,8 @@ class WarmupConstantSchedule(_LRSchedule):
|
|||||||
|
|
||||||
class WarmupLinearSchedule(_LRSchedule):
|
class WarmupLinearSchedule(_LRSchedule):
|
||||||
"""
|
"""
|
||||||
Linear warmup. Linear decay after warmup.
|
Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
|
||||||
|
Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
|
||||||
"""
|
"""
|
||||||
warn_t_total = True
|
warn_t_total = True
|
||||||
def get_lr_(self, progress):
|
def get_lr_(self, progress):
|
||||||
@@ -182,8 +188,9 @@ class BertAdam(Optimizer):
|
|||||||
t_total: total number of training steps for the learning
|
t_total: total number of training steps for the learning
|
||||||
rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1
|
rate schedule, -1 means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1
|
||||||
schedule: schedule to use for the warmup (see above).
|
schedule: schedule to use for the warmup (see above).
|
||||||
Can be 'warmup_linear', 'warmup_constant', 'warmup_cosine', or a LRSchedule object.
|
Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
|
||||||
Default: 'warmup_linear'
|
If `None` or `'none'`, learning rate is always kept constant.
|
||||||
|
Default : `'warmup_linear'`
|
||||||
b1: Adams b1. Default: 0.9
|
b1: Adams b1. Default: 0.9
|
||||||
b2: Adams b2. Default: 0.999
|
b2: Adams b2. Default: 0.999
|
||||||
e: Adams epsilon. Default: 1e-6
|
e: Adams epsilon. Default: 1e-6
|
||||||
@@ -208,8 +215,8 @@ class BertAdam(Optimizer):
|
|||||||
schedule = schedule_type(warmup=warmup, t_total=t_total)
|
schedule = schedule_type(warmup=warmup, t_total=t_total)
|
||||||
else:
|
else:
|
||||||
if warmup != -1 or t_total != -1:
|
if warmup != -1 or t_total != -1:
|
||||||
logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided. "
|
logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
|
||||||
"Please specify custom warmup and t_total in LRSchedule object.")
|
"Please specify custom warmup and t_total in _LRSchedule object.")
|
||||||
defaults = dict(lr=lr, schedule=schedule,
|
defaults = dict(lr=lr, schedule=schedule,
|
||||||
b1=b1, b2=b2, e=e, weight_decay=weight_decay,
|
b1=b1, b2=b2, e=e, weight_decay=weight_decay,
|
||||||
max_grad_norm=max_grad_norm)
|
max_grad_norm=max_grad_norm)
|
||||||
|
|||||||
@@ -48,8 +48,8 @@ class OpenAIAdam(Optimizer):
|
|||||||
schedule = schedule_type(warmup=warmup, t_total=t_total)
|
schedule = schedule_type(warmup=warmup, t_total=t_total)
|
||||||
else:
|
else:
|
||||||
if warmup != -1 or t_total != -1:
|
if warmup != -1 or t_total != -1:
|
||||||
logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided. "
|
logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
|
||||||
"Please specify custom warmup and t_total in LRSchedule object.")
|
"Please specify custom warmup and t_total in _LRSchedule object.")
|
||||||
defaults = dict(lr=lr, schedule=schedule,
|
defaults = dict(lr=lr, schedule=schedule,
|
||||||
b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2,
|
b1=b1, b2=b2, e=e, weight_decay=weight_decay, vector_l2=vector_l2,
|
||||||
max_grad_norm=max_grad_norm)
|
max_grad_norm=max_grad_norm)
|
||||||
|
|||||||
@@ -22,7 +22,8 @@ import torch
|
|||||||
|
|
||||||
from pytorch_pretrained_bert import BertAdam
|
from pytorch_pretrained_bert import BertAdam
|
||||||
from pytorch_pretrained_bert import OpenAIAdam
|
from pytorch_pretrained_bert import OpenAIAdam
|
||||||
from pytorch_pretrained_bert.optimization import ConstantLR, WarmupLinearSchedule, WarmupCosineWithWarmupRestartsSchedule
|
from pytorch_pretrained_bert.optimization import ConstantLR, WarmupLinearSchedule, WarmupConstantSchedule, \
|
||||||
|
WarmupCosineWithWarmupRestartsSchedule, WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
@@ -86,6 +87,18 @@ class WarmupCosineWithRestartsTest(unittest.TestCase):
|
|||||||
self.assertTrue(np.allclose(expected_zeros, 0))
|
self.assertTrue(np.allclose(expected_zeros, 0))
|
||||||
|
|
||||||
|
|
||||||
|
class TestSchedulePlot(unittest.TestCase):
|
||||||
|
def test_plot_schedule(self):
|
||||||
|
import matplotlib as mpl
|
||||||
|
from matplotlib import pyplot as plt
|
||||||
|
m = WarmupCosineWithWarmupRestartsSchedule(warmup=.1, t_total=1000., cycles=3.)
|
||||||
|
x = np.arange(0, 1000)
|
||||||
|
y = [m.get_lr(xe) for xe in x]
|
||||||
|
y = np.asarray(y)
|
||||||
|
plt.figure(figsize=(9, 2))
|
||||||
|
plt.plot(y)
|
||||||
|
#plt.grid(True)
|
||||||
|
plt.show()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
Reference in New Issue
Block a user