From 022525b0031bcdbbb62d1223f75919983f2ac426 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Tue, 12 Nov 2019 11:08:47 +0100 Subject: [PATCH] replace LambdaLR scheduler wrappers by function Custom schedulers are currently instantiated by wrapping PyTorch's LambdaLR class and passing a method of the wrapping class to the __init__ function of LambdaLR. This approach is not appropriate for several reasons: 1. one does not need to define a class when it only defines an __init__() method; 2. instantiating the parent class by passing a method of the child class creates a cyclical reference which leads to memory leaks. See issues #1742 and #1134. In this commit we replace the wrapper classes with functions that instantiate `LambdaLR` with a custom learning rate function. We use a closure to specify the parameters of the latter. We also do a bit of renaming within the functions to make the behaviour explicit and remove docstrings that are no longer necessary. --- transformers/__init__.py | 4 +- transformers/optimization.py | 108 +++++++++--------------- transformers/tests/optimization_test.py | 29 ++++--- 3 files changed, 61 insertions(+), 80 deletions(-) diff --git a/transformers/__init__.py b/transformers/__init__.py index 53f3c39dc7..426f3bd3a2 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -97,8 +97,8 @@ if is_torch_available(): from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model # Optimization - from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule, - WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) + from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup, + get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup) # TensorFlow diff --git a/transformers/optimization.py b/transformers/optimization.py index a48b5fea54..99e6cc75e4 100644 --- 
a/transformers/optimization.py +++ b/transformers/optimization.py @@ -23,89 +23,65 @@ from torch.optim.lr_scheduler import LambdaLR logger = logging.getLogger(__name__) -class ConstantLRSchedule(LambdaLR): - """ Constant learning rate schedule. + +def get_constant_schedule(optimizer, last_epoch=-1): + """ Create a schedule with a constant learning rate. """ - def __init__(self, optimizer, last_epoch=-1): - super(ConstantLRSchedule, self).__init__(optimizer, lambda _: 1.0, last_epoch=last_epoch) + return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch) -class WarmupConstantSchedule(LambdaLR): - """ Linear warmup and then constant. - Multiplies the learning rate defined in the optimizer by a dynamic variable determined by the current step. - Linearly increases the multiplicative variable from 0. to 1. over `warmup_steps` training steps. - Keeps multiplicative variable equal to 1. after warmup_steps. +def get_constant_schedule_with_warmup(optimizer, num_warmup_steps, last_epoch=-1): + """ Create a schedule with a constant learning rate preceded by a warmup + period during which the learning rate increases linearly between 0 and 1. """ - def __init__(self, optimizer, warmup_steps, last_epoch=-1): - self.warmup_steps = warmup_steps - super(WarmupConstantSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) - - def lr_lambda(self, step): - if step < self.warmup_steps: - return float(step) / float(max(1.0, self.warmup_steps)) + def lr_lambda(current_step): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1.0, num_warmup_steps)) return 1. + return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) -class WarmupLinearSchedule(LambdaLR): - """ Linear warmup and then linear decay. - Multiplies the learning rate defined in the optimizer by a dynamic variable determined by the current step. - Linearly increases the multiplicative variable from 0. to 1. over `warmup_steps` training steps. 
- Linearly decreases the multiplicative variable from 1. to 0. over remaining `t_total - warmup_steps` steps. + +def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): + """ Create a schedule with a learning rate that decreases linearly after + linearly increasing during a warmup period. """ - def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1): - self.warmup_steps = warmup_steps - self.t_total = t_total - super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) + def lr_lambda(current_step): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))) - def lr_lambda(self, step): - if step < self.warmup_steps: - return float(step) / float(max(1, self.warmup_steps)) - return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps))) + return LambdaLR(optimizer, lr_lambda, last_epoch) -class WarmupCosineSchedule(LambdaLR): - """ Linear warmup and then cosine decay. - Multiplies the learning rate defined in the optimizer by a dynamic variable determined by the current step. - Linearly increases the multiplicative variable from 0. to 1. over `warmup_steps` training steps. - Decreases the multiplicative variable from 1. to 0. over remaining `t_total - warmup_steps` steps following a cosine curve. - If `cycles` (default=0.5) is different from default, then the multiplicative variable follows cosine function after warmup. +def get_cosine_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=.5, last_epoch=-1): + """ Create a schedule with a learning rate that decreases following the + values of the cosine function between 0 and `pi * cycles` after a warmup + period during which it increases linearly between 0 and 1. 
""" - def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1): - self.warmup_steps = warmup_steps - self.t_total = t_total - self.cycles = cycles - super(WarmupCosineSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) + def lr_lambda(current_step): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) + return max(0., 0.5 * (1. + math.cos(math.pi * float(num_cycles) * 2. * progress))) - def lr_lambda(self, step): - if step < self.warmup_steps: - return float(step) / float(max(1.0, self.warmup_steps)) - # progress after warmup - progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps)) - return max(0.0, 0.5 * (1. + math.cos(math.pi * float(self.cycles) * 2.0 * progress))) + return LambdaLR(optimizer, lr_lambda, last_epoch) -class WarmupCosineWithHardRestartsSchedule(LambdaLR): - """ Linear warmup and then cosine cycles with hard restarts. - Multiplies the learning rate defined in the optimizer by a dynamic variable determined by the current step. - Linearly increases the multiplicative variable from 0. to 1. over `warmup_steps` training steps. - If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying - learning rate (with hard restarts). +def get_cosine_with_hard_restarts_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, num_cycles=1., last_epoch=-1): + """ Create a schedule with a learning rate that decreases following the + values of the cosine function with several hard restarts, after a warmup + period during which it increases linearly between 0 and 1. 
""" - def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1): - self.warmup_steps = warmup_steps - self.t_total = t_total - self.cycles = cycles - super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch) - - def lr_lambda(self, step): - if step < self.warmup_steps: - return float(step) / float(max(1, self.warmup_steps)) - # progress after warmup - progress = float(step - self.warmup_steps) / float(max(1, self.t_total - self.warmup_steps)) - if progress >= 1.0: - return 0.0 - return max(0.0, 0.5 * (1. + math.cos(math.pi * ((float(self.cycles) * progress) % 1.0)))) + def lr_lambda(current_step): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) + if progress >= 1.: + return 0. + return max(0., 0.5 * (1. + math.cos(math.pi * ((float(num_cycles) * progress) % 1.)))) + return LambdaLR(optimizer, lr_lambda, last_epoch) class AdamW(Optimizer): diff --git a/transformers/tests/optimization_test.py b/transformers/tests/optimization_test.py index 84dbaca52a..ab9afbfcf7 100644 --- a/transformers/tests/optimization_test.py +++ b/transformers/tests/optimization_test.py @@ -25,8 +25,12 @@ from transformers import is_torch_available if is_torch_available(): import torch - from transformers import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, - WarmupCosineSchedule, WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule) + from transformers import (AdamW, + get_constant_schedule, + get_constant_schedule_with_warmup, + get_cosine_schedule_with_warmup, + get_cosine_with_hard_restarts_schedule_with_warmup, + get_linear_schedule_with_warmup) else: pytestmark = pytest.mark.skip("Require Torch") @@ -87,59 +91,60 @@ class ScheduleInitTest(unittest.TestCase): self.assertAlmostEqual(a, b, delta=tol) def test_constant_scheduler(self): - scheduler = 
ConstantLRSchedule(self.optimizer) + scheduler = get_constant_schedule(self.optimizer) lrs = unwrap_schedule(scheduler, self.num_steps) expected_learning_rates = [10.] * self.num_steps self.assertEqual(len(lrs[0]), 1) self.assertListEqual([l[0] for l in lrs], expected_learning_rates) - scheduler = ConstantLRSchedule(self.optimizer) + scheduler = get_constant_schedule(self.optimizer) lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) def test_warmup_constant_scheduler(self): - scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4) + scheduler = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4) lrs = unwrap_schedule(scheduler, self.num_steps) expected_learning_rates = [2.5, 5.0, 7.5, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0] self.assertEqual(len(lrs[0]), 1) self.assertListEqual([l[0] for l in lrs], expected_learning_rates) - scheduler = WarmupConstantSchedule(self.optimizer, warmup_steps=4) + scheduler = get_constant_schedule_with_warmup(self.optimizer, num_warmup_steps=4) lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) def test_warmup_linear_scheduler(self): - scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10) + scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10) lrs = unwrap_schedule(scheduler, self.num_steps) expected_learning_rates = [5.0, 10.0, 8.75, 7.5, 6.25, 5.0, 3.75, 2.5, 1.25, 0.0] self.assertEqual(len(lrs[0]), 1) self.assertListEqual([l[0] for l in lrs], expected_learning_rates) - scheduler = WarmupLinearSchedule(self.optimizer, warmup_steps=2, t_total=10) + scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10) lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) self.assertListEqual([l[0] for l in lrs], [l[0] for l in 
lrs_2]) def test_warmup_cosine_scheduler(self): - scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10) + scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10) lrs = unwrap_schedule(scheduler, self.num_steps) expected_learning_rates = [5.0, 10.0, 9.61, 8.53, 6.91, 5.0, 3.08, 1.46, 0.38, 0.0] self.assertEqual(len(lrs[0]), 1) self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) - scheduler = WarmupCosineSchedule(self.optimizer, warmup_steps=2, t_total=10) + scheduler = get_cosine_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_training_steps=10) lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) def test_warmup_cosine_hard_restart_scheduler(self): - scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10) + scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10) lrs = unwrap_schedule(scheduler, self.num_steps) expected_learning_rates = [5.0, 10.0, 8.53, 5.0, 1.46, 10.0, 8.53, 5.0, 1.46, 0.0] self.assertEqual(len(lrs[0]), 1) self.assertListAlmostEqual([l[0] for l in lrs], expected_learning_rates, tol=1e-2) - scheduler = WarmupCosineWithHardRestartsSchedule(self.optimizer, warmup_steps=2, cycles=2, t_total=10) + scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(self.optimizer, num_warmup_steps=2, num_cycles=2, num_training_steps=10) lrs_2 = unwrap_and_save_reload_schedule(scheduler, self.num_steps) self.assertListEqual([l[0] for l in lrs], [l[0] for l in lrs_2]) + if __name__ == "__main__": unittest.main()