From e04bab59e1cf7e683e0365fd78e9e59e83c16d32 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Tue, 26 Feb 2019 16:22:52 +0100 Subject: [PATCH] fix for negative learning rate with warmup_linear in BertAdam (happens when t_total is specified incorrectly) + copied BERT optimization warmup functions to OpenAI optimization file + added comments --- pytorch_pretrained_bert/optimization.py | 6 +++++- .../optimization_openai.py | 19 +++++++++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index f3d1de0d37..2b20cd87b7 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -26,14 +26,18 @@ def warmup_cosine(x, warmup=0.002): return 0.5 * (1.0 + torch.cos(math.pi * x)) def warmup_constant(x, warmup=0.002): + """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. + Learning rate is 1. afterwards. """ if x < warmup: return x/warmup return 1.0 def warmup_linear(x, warmup=0.002): + """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step. + After `t_total`-th training step, learning rate is zero. 
""" if x < warmup: return x/warmup - return 1.0 - x + return max(1.0 - x, 0) SCHEDULES = { 'warmup_cosine':warmup_cosine, diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py index 4cc815c9ea..5950865a17 100644 --- a/pytorch_pretrained_bert/optimization_openai.py +++ b/pytorch_pretrained_bert/optimization_openai.py @@ -21,16 +21,23 @@ from torch.optim.optimizer import required from torch.nn.utils import clip_grad_norm_ def warmup_cosine(x, warmup=0.002): - s = 1 if x <= warmup else 0 - return s*(x/warmup) + (1-s)*(0.5 * (1 + torch.cos(math.pi * x))) + if x < warmup: + return x/warmup + return 0.5 * (1.0 + torch.cos(math.pi * x)) def warmup_constant(x, warmup=0.002): - s = 1 if x <= warmup else 0 - return s*(x/warmup) + (1-s)*1 + """ Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps. + Learning rate is 1. afterwards. """ + if x < warmup: + return x/warmup + return 1.0 def warmup_linear(x, warmup=0.002): - s = 1 if x <= warmup else 0 - return (s*(x/warmup) + (1-s))*(1-x) + """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to OpenAIAdam) training step. + After `t_total`-th training step, learning rate is zero. """ + if x < warmup: + return x/warmup + return max(1.0 - x, 0) SCHEDULES = { 'warmup_cosine':warmup_cosine,