Fix negative learning rate with warmup_linear in BertAdam (occurs when training runs past t_total, i.e. t_total was specified too small)

Also copied the BERT optimization warmup functions to the OpenAI optimization file and added docstrings.
2019-02-26 16:22:52 +01:00
parent 2152bfeae8
commit e04bab59e1
2 changed files with 18 additions and 7 deletions
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -26,14 +26,18 @@ def warmup_cosine(x, warmup=0.002):
    return 0.5 * (1.0 + torch.cos(math.pi * x))
 def warmup_constant(x, warmup=0.002):
    """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.
        Learning rate is 1. afterwards. """
    if x < warmup:
        return x/warmup
    return 1.0
 def warmup_linear(x, warmup=0.002):
    """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step.
        After `t_total`-th training step, learning rate is zero. """
    if x < warmup:
        return x/warmup
-    return 1.0 - x
+    return max(1.0 - x, 0)
 SCHEDULES = {
    'warmup_cosine':warmup_cosine,
--- a/pytorch_pretrained_bert/optimization_openai.py
+++ b/pytorch_pretrained_bert/optimization_openai.py
@@ -21,16 +21,23 @@ from torch.optim.optimizer import required
 from torch.nn.utils import clip_grad_norm_
 def warmup_cosine(x, warmup=0.002):
-    s = 1 if x <= warmup else 0
+    if x < warmup:
-    return s*(x/warmup) + (1-s)*(0.5 * (1 + torch.cos(math.pi * x)))
+        return x/warmup
    return 0.5 * (1.0 + torch.cos(math.pi * x))
 def warmup_constant(x, warmup=0.002):
-    s = 1 if x <= warmup else 0
+    """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.
-    return s*(x/warmup) + (1-s)*1
+        Learning rate is 1. afterwards. """
    if x < warmup:
        return x/warmup
    return 1.0
 def warmup_linear(x, warmup=0.002):
-    s = 1 if x <= warmup else 0
+    """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step.
-    return (s*(x/warmup) + (1-s))*(1-x)
+        After `t_total`-th training step, learning rate is zero. """
    if x < warmup:
        return x/warmup
    return max(1.0 - x, 0)
 SCHEDULES = {
    'warmup_cosine':warmup_cosine,