From e04bab59e1cf7e683e0365fd78e9e59e83c16d32 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Tue, 26 Feb 2019 16:22:52 +0100
Subject: [PATCH] fix for negative learning rate with warmup_linear in BertAdam
 (happens when t_total is specified incorrectly) + copied BERT optimization
 warmup functions to OpenAI optimization file + added comments

---
 pytorch_pretrained_bert/optimization.py       |  6 +++++-
 .../optimization_openai.py                    | 19 +++++++++++++------
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index f3d1de0d37..2b20cd87b7 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -26,14 +26,18 @@ def warmup_cosine(x, warmup=0.002):
     return 0.5 * (1.0 + torch.cos(math.pi * x))
 
 def warmup_constant(x, warmup=0.002):
+    """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.
+        Learning rate is 1. afterwards. """
     if x < warmup:
         return x/warmup
     return 1.0
 
 def warmup_linear(x, warmup=0.002):
+    """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step.
+        After `t_total`-th training step, learning rate is zero. """
     if x < warmup:
         return x/warmup
-    return 1.0 - x
+    return max(1.0 - x, 0)
 
 SCHEDULES = {
     'warmup_cosine':warmup_cosine,
diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py
index 4cc815c9ea..5950865a17 100644
--- a/pytorch_pretrained_bert/optimization_openai.py
+++ b/pytorch_pretrained_bert/optimization_openai.py
@@ -21,16 +21,23 @@ from torch.optim.optimizer import required
 from torch.nn.utils import clip_grad_norm_
 
 def warmup_cosine(x, warmup=0.002):
-    s = 1 if x <= warmup else 0
-    return s*(x/warmup) + (1-s)*(0.5 * (1 + torch.cos(math.pi * x)))
+    if x < warmup:
+        return x/warmup
+    return 0.5 * (1.0 + torch.cos(math.pi * x))
 
 def warmup_constant(x, warmup=0.002):
-    s = 1 if x <= warmup else 0
-    return s*(x/warmup) + (1-s)*1
+    """ Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps.
+        Learning rate is 1. afterwards. """
+    if x < warmup:
+        return x/warmup
+    return 1.0
 
 def warmup_linear(x, warmup=0.002):
-    s = 1 if x <= warmup else 0
-    return (s*(x/warmup) + (1-s))*(1-x)
+    """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to OpenAIAdam) training step.
+        After `t_total`-th training step, learning rate is zero. """
+    if x < warmup:
+        return x/warmup
+    return max(1.0 - x, 0)
 
 SCHEDULES = {
     'warmup_cosine':warmup_cosine,