From 88874f6cf09e14fc482abc186adebb2767dca258 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Fri, 8 Mar 2019 19:08:30 +0100
Subject: [PATCH 01/21] BertAdam schedule objects

---
 pytorch_pretrained_bert/optimization.py | 141 +++++++++++++++++-------
 1 file changed, 99 insertions(+), 42 deletions(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index fa911e5c04..73afc71058 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -23,29 +23,99 @@ import logging
 
 logger = logging.getLogger(__name__)
 
-def warmup_cosine(x, warmup=0.002):
-    if x < warmup:
-        return x/warmup
-    return 0.5 * (1.0 + torch.cos(math.pi * x))
 
-def warmup_constant(x, warmup=0.002):
-    """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.
-        Learning rate is 1. afterwards. """
-    if x < warmup:
-        return x/warmup
-    return 1.0
+class LRSchedule(object):
+    warn_t_total = False
+    def __init__(self, warmup=0.002, t_total=-1, **kw):
+        super(LRSchedule, self).__init__(**kw)
+        self.warmup, self.t_total = warmup, t_total
+        if t_total <= 0:
+            logger.warning("t_total value of {} results in schedule not being applied".format(t_total))
+        if not 0.0 <= warmup < 1.0 and not warmup == -1:
+            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
+        self.warned_for_t_total_at_progress = -1
 
-def warmup_linear(x, warmup=0.002):
-    """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step.
-        After `t_total`-th training step, learning rate is zero. """
-    if x < warmup:
-        return x/warmup
-    return max((x-1.)/(warmup-1.), 0)
+    def get_lr(self, step, nowarn=False):
+        progress = step / self.t_total
+        ret = self.get_lr_(progress)
+        # warning for exceeding t_total (only active for schedules whose class sets warn_t_total = True)
+        if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:
+            logger.warning(
+                "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly."
+                    .format(ret, self.__class__.__name__))
+            self.warned_for_t_total_at_progress = progress
+        # end warning
+        return ret
+
+    def get_lr_(self, step):
+        return 1.
+        # raise NotImplemented("use subclass")
+
+
+class WarmupCosineSchedule(LRSchedule):
+    warn_t_total = True
+    def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
+        super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)
+        self.cycles = cycles
+
+    def get_lr_(self, progress):
+        """ get learning rate multiplier """
+        if self.t_total <= 0:
+            return 1.
+        if progress < self.warmup:
+            return progress / self.warmup
+        else:
+            progress = (progress - self.warmup) / (1 - self.warmup)   # progress after warmup
+            return 0.5 * (1. + torch.cos(math.pi * self.cycles * 2 * progress))
+
+
+class WarmupConstantSchedule(LRSchedule):
+    warn_t_total = False
+    def get_lr_(self, progress):
+        if progress < self.warmup:
+            return progress / self.warmup
+        return 1.
+
+
+class WarmupLinearSchedule(LRSchedule):
+    warn_t_total = True
+    def get_lr_(self, progress):
+        if progress < self.warmup:
+            return progress / self.warmup
+        return max((progress - 1.) / (self.warmup - 1.), 0)
+#
+#
+# def warmup_cosine(x, warmup=0.002):
+#     if x < warmup:
+#         return x/warmup
+#     return 0.5 * (1.0 + torch.cos(math.pi * x))
+#
+# def warmup_constant(x, warmup=0.002):
+#     """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.
+#         Learning rate is 1. afterwards. """
+#     if x < warmup:
+#         return x/warmup
+#     return 1.0
+#
+# def warmup_linear(x, warmup=0.002):
+#     """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step.
+#         After `t_total`-th training step, learning rate is zero. """
+#     if x < warmup:
+#         return x/warmup
+#     return max((x-1.)/(warmup-1.), 0)
+#
+# SCHEDULES = {
+#     'warmup_cosine':   warmup_cosine,
+#     'warmup_constant': warmup_constant,
+#     'warmup_linear':   warmup_linear,
+# }
 
 SCHEDULES = {
-    'warmup_cosine':   warmup_cosine,
-    'warmup_constant': warmup_constant,
-    'warmup_linear':   warmup_linear,
+    None:       LRSchedule,
+    "none":     LRSchedule,
+    "warmup_cosine": WarmupCosineSchedule,
+    "warmup_constant": WarmupConstantSchedule,
+    "warmup_linear": WarmupLinearSchedule
 }
 
 
@@ -70,15 +140,16 @@ class BertAdam(Optimizer):
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
         if schedule not in SCHEDULES:
             raise ValueError("Invalid schedule parameter: {}".format(schedule))
-        if not 0.0 <= warmup < 1.0 and not warmup == -1:
-            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
         if not 0.0 <= b1 < 1.0:
             raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
         if not 0.0 <= b2 < 1.0:
             raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
         if not e >= 0.0:
             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
-        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
+        # initialize schedule object
+        schedule_type = SCHEDULES[schedule]
+        sched = schedule_type(warmup=warmup, t_total=t_total)
+        defaults = dict(lr=lr, schedule=sched,
                         b1=b1, b2=b2, e=e, weight_decay=weight_decay,
                         max_grad_norm=max_grad_norm)
         super(BertAdam, self).__init__(params, defaults)
@@ -90,11 +161,10 @@ class BertAdam(Optimizer):
                 state = self.state[p]
                 if len(state) == 0:
                     return [0]
-                if group['t_total'] != -1:
-                    schedule_fct = SCHEDULES[group['schedule']]
-                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
-                else:
-                    lr_scheduled = group['lr']
+
+                lr_scheduled = group['lr']
+                lr_scheduled *= group['schedule'](state['step'])
+
                 lr.append(lr_scheduled)
         return lr
 
@@ -109,8 +179,6 @@ class BertAdam(Optimizer):
         if closure is not None:
             loss = closure()
 
-        warned_for_t_total = False
-
         for group in self.param_groups:
             for p in group['params']:
                 if p.grad is None:
@@ -152,19 +220,8 @@ class BertAdam(Optimizer):
                 if group['weight_decay'] > 0.0:
                     update += group['weight_decay'] * p.data
 
-                if group['t_total'] != -1:
-                    schedule_fct = SCHEDULES[group['schedule']]
-                    progress = state['step']/group['t_total']
-                    lr_scheduled = group['lr'] * schedule_fct(progress, group['warmup'])
-                    # warning for exceeding t_total (only active with warmup_linear
-                    if group['schedule'] == "warmup_linear" and progress > 1. and not warned_for_t_total:
-                        logger.warning(
-                            "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. "
-                            "Please set 't_total' of {} correctly.".format(group['schedule'], lr_scheduled, self.__class__.__name__))
-                        warned_for_t_total = True
-                    # end warning
-                else:
-                    lr_scheduled = group['lr']
+                lr_scheduled = group['lr']
+                lr_scheduled *= group['schedule'](state['step'])
 
                 update_with_lr = lr_scheduled * update
                 p.data.add_(-update_with_lr)

From 90a41dbe1404f734f6a25bfbaf89be71ba5e4613 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Sat, 9 Mar 2019 02:23:20 +0100
Subject: [PATCH 02/21] BertAdam schedule objects

---
 pytorch_pretrained_bert/__init__.py     |  2 +-
 pytorch_pretrained_bert/optimization.py | 48 +++++++++----------------
 2 files changed, 17 insertions(+), 33 deletions(-)

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index bd455b8d9c..e82d409ee0 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -18,7 +18,7 @@ from .modeling_gpt2 import (GPT2Config, GPT2Model,
                             GPT2LMHeadModel, GPT2DoubleHeadsModel,
                             load_tf_weights_in_gpt2)
 
-from .optimization import BertAdam
+from .optimization import *
 from .optimization_openai import OpenAIAdam
 
 from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path
diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index 73afc71058..cea35c39e9 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -24,6 +24,9 @@ import logging
 logger = logging.getLogger(__name__)
 
 
+__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam"]
+
+
 class LRSchedule(object):
     warn_t_total = False
     def __init__(self, warmup=0.002, t_total=-1, **kw):
@@ -83,32 +86,7 @@ class WarmupLinearSchedule(LRSchedule):
         if progress < self.warmup:
             return progress / self.warmup
         return max((progress - 1.) / (self.warmup - 1.), 0)
-#
-#
-# def warmup_cosine(x, warmup=0.002):
-#     if x < warmup:
-#         return x/warmup
-#     return 0.5 * (1.0 + torch.cos(math.pi * x))
-#
-# def warmup_constant(x, warmup=0.002):
-#     """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.
-#         Learning rate is 1. afterwards. """
-#     if x < warmup:
-#         return x/warmup
-#     return 1.0
-#
-# def warmup_linear(x, warmup=0.002):
-#     """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step.
-#         After `t_total`-th training step, learning rate is zero. """
-#     if x < warmup:
-#         return x/warmup
-#     return max((x-1.)/(warmup-1.), 0)
-#
-# SCHEDULES = {
-#     'warmup_cosine':   warmup_cosine,
-#     'warmup_constant': warmup_constant,
-#     'warmup_linear':   warmup_linear,
-# }
+
 
 SCHEDULES = {
     None:       LRSchedule,
@@ -126,7 +104,9 @@ class BertAdam(Optimizer):
         warmup: portion of t_total for the warmup, -1  means no warmup. Default: -1
         t_total: total number of training steps for the learning
             rate schedule, -1  means constant learning rate. Default: -1
-        schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
+        schedule: schedule to use for the warmup (see above).
+            Can be 'warmup_linear', 'warmup_constant', 'warmup_cosine', 'none', None, or an LRSchedule object.
+            Default: 'warmup_linear'
         b1: Adams b1. Default: 0.9
         b2: Adams b2. Default: 0.999
         e: Adams epsilon. Default: 1e-6
@@ -147,9 +127,13 @@ class BertAdam(Optimizer):
         if not e >= 0.0:
             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
         # initialize schedule object
-        schedule_type = SCHEDULES[schedule]
-        sched = schedule_type(warmup=warmup, t_total=t_total)
-        defaults = dict(lr=lr, schedule=sched,
+        if not isinstance(schedule, LRSchedule):
+            schedule_type = SCHEDULES[schedule]
+            schedule = schedule_type(warmup=warmup, t_total=t_total)
+        else:
+            if warmup != -1 or t_total != -1:
+                logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided.")
+        defaults = dict(lr=lr, schedule=schedule,
                         b1=b1, b2=b2, e=e, weight_decay=weight_decay,
                         max_grad_norm=max_grad_norm)
         super(BertAdam, self).__init__(params, defaults)
@@ -163,7 +147,7 @@ class BertAdam(Optimizer):
                     return [0]
 
                 lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'](state['step'])
+                lr_scheduled *= group['schedule'].get_lr(state['step'])
 
                 lr.append(lr_scheduled)
         return lr
@@ -221,7 +205,7 @@ class BertAdam(Optimizer):
                     update += group['weight_decay'] * p.data
 
                 lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'](state['step'])
+                lr_scheduled *= group['schedule'].get_lr(state['step'])
 
                 update_with_lr = lr_scheduled * update
                 p.data.add_(-update_with_lr)

From f113a2dfdcf3116a35f856b274b4e4c2ecbeb6c0 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Sat, 9 Mar 2019 02:29:57 +0100
Subject: [PATCH 03/21] readme de

---
 pytorch_pretrained_bert/optimization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index cea35c39e9..84f329feae 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -118,7 +118,7 @@ class BertAdam(Optimizer):
                  max_grad_norm=1.0):
         if lr is not required and lr < 0.0:
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
-        if schedule not in SCHEDULES:
+        if not isinstance(schedule, LRSchedule) and schedule not in SCHEDULES:
             raise ValueError("Invalid schedule parameter: {}".format(schedule))
         if not 0.0 <= b1 < 1.0:
             raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))

From 51efde54a907c85495263a980d253dcdb3e75209 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Sat, 9 Mar 2019 02:45:25 +0100
Subject: [PATCH 04/21] cos fix

---
 pytorch_pretrained_bert/optimization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index 84f329feae..a92adb4c56 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -69,7 +69,7 @@ class WarmupCosineSchedule(LRSchedule):
             return progress / self.warmup
         else:
             progress = (progress - self.warmup) / (1 - self.warmup)   # progress after warmup
-            return 0.5 * (1. + torch.cos(math.pi * self.cycles * 2 * progress))
+            return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
 
 
 class WarmupConstantSchedule(LRSchedule):

From baf66d141958785feb0dfc90d6cd8558eb95a774 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Tue, 12 Mar 2019 13:22:23 +0100
Subject: [PATCH 05/21] restart cosine lr schedule

---
 pytorch_pretrained_bert/optimization.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index a92adb4c56..58e16f01a6 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -69,7 +69,23 @@ class WarmupCosineSchedule(LRSchedule):
             return progress / self.warmup
         else:
             progress = (progress - self.warmup) / (1 - self.warmup)   # progress after warmup
-            return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
+            return 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1))
+
+
+class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule):
+    warn_t_total = True
+    def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
+        super(WarmupCosineWithRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
+
+    def get_lr_(self, progress):
+        if self.t_total <= 0:
+            return 1.
+        if progress < self.warmup:
+            return progress / self.warmup
+        else:
+            progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup
+            ret = 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
+            return ret
 
 
 class WarmupConstantSchedule(LRSchedule):

From 902461333715d16773cac1e5e1300be705f49205 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Tue, 12 Mar 2019 13:23:58 +0100
Subject: [PATCH 06/21] changing docker

---
 pytorch_pretrained_bert/optimization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index 58e16f01a6..481072c483 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -69,7 +69,7 @@ class WarmupCosineSchedule(LRSchedule):
             return progress / self.warmup
         else:
             progress = (progress - self.warmup) / (1 - self.warmup)   # progress after warmup
-            return 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1))
+            return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
 
 
 class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule):
@@ -84,7 +84,7 @@ class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule):
             return progress / self.warmup
         else:
             progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup
-            ret = 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
+            ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1)))
             return ret
 
 

From 471daf1b6c0821e8b5ab6a173d7f41de079bae8a Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Tue, 12 Mar 2019 13:32:42 +0100
Subject: [PATCH 07/21] changing docker

---
 pytorch_pretrained_bert/optimization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index 481072c483..dcd8cfff2f 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -84,7 +84,7 @@ class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule):
             return progress / self.warmup
         else:
             progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup
-            ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1)))
+            ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1)))
             return ret
 
 

From eac039d21f8141c501bd75d02d46ba5545797b63 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Tue, 12 Mar 2019 13:45:12 +0100
Subject: [PATCH 08/21] changing docker

---
 pytorch_pretrained_bert/optimization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index dcd8cfff2f..7eda3ba92a 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -24,7 +24,7 @@ import logging
 logger = logging.getLogger(__name__)
 
 
-__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam"]
+__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam", "WarmupCosineWithRestartsSchedule"]
 
 
 class LRSchedule(object):

From 20e652209c7da7a73c9d1f3a65418d0ea118680e Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Wed, 13 Mar 2019 16:13:37 +0100
Subject: [PATCH 09/21] relation classification: replacing entity mention with
 mask token

---
 pytorch_pretrained_bert/optimization.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index 7eda3ba92a..9a873e221b 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -130,7 +130,7 @@ class BertAdam(Optimizer):
         max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
     """
     def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
-                 b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
+                 b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, init_weight_decay=0.,
                  max_grad_norm=1.0):
         if lr is not required and lr < 0.0:
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
@@ -150,7 +150,7 @@ class BertAdam(Optimizer):
             if warmup != -1 or t_total != -1:
                 logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided.")
         defaults = dict(lr=lr, schedule=schedule,
-                        b1=b1, b2=b2, e=e, weight_decay=weight_decay,
+                        b1=b1, b2=b2, e=e, weight_decay=weight_decay, init_weight_decay=init_weight_decay,
                         max_grad_norm=max_grad_norm)
         super(BertAdam, self).__init__(params, defaults)
 
@@ -220,6 +220,8 @@ class BertAdam(Optimizer):
                 if group['weight_decay'] > 0.0:
                     update += group['weight_decay'] * p.data
 
+                # TODO: init weight decay
+
                 lr_scheduled = group['lr']
                 lr_scheduled *= group['schedule'].get_lr(state['step'])
 

From bed6408dcce8cf1b04e8dfa41f481500f40e47ca Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Mon, 18 Mar 2019 13:09:55 +0100
Subject: [PATCH 10/21] branches, optim cosine fix

---
 pytorch_pretrained_bert/optimization.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index fa911e5c04..e553365b54 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -26,7 +26,9 @@ logger = logging.getLogger(__name__)
 def warmup_cosine(x, warmup=0.002):
     if x < warmup:
         return x/warmup
-    return 0.5 * (1.0 + torch.cos(math.pi * x))
+
+    x_ = (x - warmup) / (1 - warmup)  # progress after warmup
+    return 0.5 * (1. + math.cos(math.pi * x_))
 
 def warmup_constant(x, warmup=0.002):
     """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.

From ef28b2c74739162b88d78009ed3fae74deeb8b36 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Mon, 18 Mar 2019 13:18:07 +0100
Subject: [PATCH 11/21] branches, optim cosine fix

---
 pytorch_pretrained_bert/optimization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index 95411857c6..aa59c7d7ec 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
 def warmup_cosine(x, warmup=0.002):
     if x < warmup:
         return x/warmup
-    x_ = (x - warmup) / (1 - warmup)  # progress after warmup
+    x_ = (x - warmup) / (1 - warmup)  # progress after warmup -
     return 0.5 * (1. + math.cos(math.pi * x_))
 
 def warmup_constant(x, warmup=0.002):

From 2283dcca5e62438efbe445d518cddfaa2842a75b Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Mon, 18 Mar 2019 13:40:12 +0100
Subject: [PATCH 12/21] import revert

---
 pytorch_pretrained_bert/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index e82d409ee0..bd455b8d9c 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -18,7 +18,7 @@ from .modeling_gpt2 import (GPT2Config, GPT2Model,
                             GPT2LMHeadModel, GPT2DoubleHeadsModel,
                             load_tf_weights_in_gpt2)
 
-from .optimization import *
+from .optimization import BertAdam
 from .optimization_openai import OpenAIAdam
 
 from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path

From 19cc2c084e265d1cf2ae2218147cd52bdd71bf1a Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Mon, 18 Mar 2019 15:13:35 +0100
Subject: [PATCH 13/21] same

---
 pytorch_pretrained_bert/optimization_openai.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py
index 7f56a1284c..99ac15e108 100644
--- a/pytorch_pretrained_bert/optimization_openai.py
+++ b/pytorch_pretrained_bert/optimization_openai.py
@@ -26,7 +26,8 @@ logger = logging.getLogger(__name__)
 def warmup_cosine(x, warmup=0.002):
     if x < warmup:
         return x/warmup
-    return 0.5 * (1.0 + torch.cos(math.pi * x))
+    x_ = (x - warmup) / (1 - warmup)  # progress after warmup
+    return 0.5 * (1. + math.cos(math.pi * x_))
 
 def warmup_constant(x, warmup=0.002):
     """ Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps.

From 7797d21b8d7180af2114890c4c3393c762a69154 Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Sun, 24 Mar 2019 13:34:30 -0700
Subject: [PATCH 14/21] Fix GPT2 language modeling loss computation

---
 pytorch_pretrained_bert/modeling_gpt2.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index c381b288f8..13ae7a2342 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -617,8 +617,16 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
         lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = lm_logits[:, :-1]
+            shift_labels = torch_batch[:, 1:]
+
+            # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes]
+            # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
+            # We just flatten the tokens out this way.
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))
+                            shift_labels.view(-1))
             return loss
         return lm_logits, presents
 

From 5938f31fa7aa28cdff662f79c7c038cab21bb370 Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Sun, 24 Mar 2019 13:35:32 -0700
Subject: [PATCH 15/21] Fix c/p typo from my experiment code

---
 pytorch_pretrained_bert/modeling_gpt2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 13ae7a2342..1733a5b3f4 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -619,7 +619,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         if lm_labels is not None:
             # Shift so that tokens < n predict n
             shift_logits = lm_logits[:, :-1]
-            shift_labels = torch_batch[:, 1:]
+            shift_labels = lm_labels[:, 1:]
 
             # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes]
             # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]

From 2e6f5ffb96029398f740b6eacdc86b117cccb86b Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Sun, 24 Mar 2019 13:36:46 -0700
Subject: [PATCH 16/21] Fix GPT language model loss here as well

---
 pytorch_pretrained_bert/modeling_openai.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 296abbfc31..9c708f88a2 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -716,8 +716,16 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
         lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = lm_logits[:, :-1]
+            shift_labels = lm_labels[:, 1:]
+
+            # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes]
+            # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
+            # We just flatten the tokens out this way.
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))
+                            shift_labels.view(-1))
             return loss
         return lm_logits
 

From 472857c47f3b6a142a7aaa53836e33cd8543088d Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Sun, 24 Mar 2019 13:49:42 -0700
Subject: [PATCH 17/21] Fix typo syntax err (sorry, c/p from my repo)

---
 pytorch_pretrained_bert/modeling_gpt2.py   | 2 +-
 pytorch_pretrained_bert/modeling_openai.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 1733a5b3f4..15e7ca26e1 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -625,7 +625,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
             # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
             # We just flatten the tokens out this way.
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))
             return loss
         return lm_logits, presents
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 9c708f88a2..ab4107667b 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -724,7 +724,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
             # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
             # We just flatten the tokens out this way.
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))
             return loss
         return lm_logits

From 0dd796e359d1fbf9c0ea39b04e9b5655e5a09dee Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Sun, 24 Mar 2019 14:35:55 -0700
Subject: [PATCH 18/21] Also fix loss function issue with the double head
 models

---
 pytorch_pretrained_bert/modeling_gpt2.py   | 5 ++++-
 pytorch_pretrained_bert/modeling_openai.py | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 15e7ca26e1..635326b408 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -698,8 +698,11 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
         losses = []
         if lm_labels is not None:
+            shift_logits = lm_logits[:, :-1]
+            shift_labels = lm_labels[:, 1:]
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
+            losses.append(loss_fct(shift_logits.view(-1,
+                          shift_logits.size(-1)), shift_labels.view(-1)))
         if mc_labels is not None:
             loss_fct = CrossEntropyLoss()
             losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index ab4107667b..8c1dd5e4a3 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -811,8 +811,11 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
         losses = []
         if lm_labels is not None:
+            shift_logits = lm_logits[:, :-1]
+            shift_labels = lm_labels[:, 1:]
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
+            losses.append(loss_fct(shift_logits.view(-1,
+                          shift_logits.size(-1)), shift_labels.view(-1)))
         if mc_labels is not None:
             loss_fct = CrossEntropyLoss()
             losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))

From fda2f623953bfe2290cd65429eb008f02ebdb152 Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Sun, 24 Mar 2019 14:37:13 -0700
Subject: [PATCH 19/21] Fix test failures due to old torch issue with
 non-contiguous view

---
 pytorch_pretrained_bert/modeling_gpt2.py   | 8 ++++----
 pytorch_pretrained_bert/modeling_openai.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 635326b408..7a0bb4db53 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -618,8 +618,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
             # Shift so that tokens < n predict n
-            shift_logits = lm_logits[:, :-1]
-            shift_labels = lm_labels[:, 1:]
+            shift_logits = lm_logits[:, :-1].contiguous()
+            shift_labels = lm_labels[:, 1:].contiguous()
 
             # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes]
             # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
@@ -698,8 +698,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
         losses = []
         if lm_labels is not None:
-            shift_logits = lm_logits[:, :-1]
-            shift_labels = lm_labels[:, 1:]
+            shift_logits = lm_logits[:, :-1].contiguous()
+            shift_labels = lm_labels[:, 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             losses.append(loss_fct(shift_logits.view(-1,
                           shift_logits.size(-1)), shift_labels.view(-1)))
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 8c1dd5e4a3..4385c1eaa7 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -717,8 +717,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
             # Shift so that tokens < n predict n
-            shift_logits = lm_logits[:, :-1]
-            shift_labels = lm_labels[:, 1:]
+            shift_logits = lm_logits[:, :-1].contiguous()
+            shift_labels = lm_labels[:, 1:].contiguous()
 
             # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes]
             # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
@@ -811,8 +811,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
         losses = []
         if lm_labels is not None:
-            shift_logits = lm_logits[:, :-1]
-            shift_labels = lm_labels[:, 1:]
+            shift_logits = lm_logits[:, :-1].contiguous()
+            shift_labels = lm_labels[:, 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             losses.append(loss_fct(shift_logits.view(-1,
                           shift_logits.size(-1)), shift_labels.view(-1)))

From 01520d5412ab1b17c4ef0da5ed6cb9e62d6dfcb1 Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Wed, 27 Mar 2019 10:45:11 -0700
Subject: [PATCH 20/21] Remove my unhelpful comments :)

---
 pytorch_pretrained_bert/modeling_gpt2.py   | 4 +---
 pytorch_pretrained_bert/modeling_openai.py | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 7a0bb4db53..7b00ce7730 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -621,9 +621,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
             shift_logits = lm_logits[:, :-1].contiguous()
             shift_labels = lm_labels[:, 1:].contiguous()
 
-            # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes]
-            # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
-            # We just flatten the tokens out this way.
+            # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 4385c1eaa7..7273e75bf6 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -720,9 +720,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
             shift_logits = lm_logits[:, :-1].contiguous()
             shift_labels = lm_labels[:, 1:].contiguous()
 
-            # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes]
-            # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
-            # We just flatten the tokens out this way.
+            # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))

From 60005f464d2069801a2cf26dc0f011da8ed639b0 Mon Sep 17 00:00:00 2001
From: jeonsworld <37530102+jeonsworld@users.noreply.github.com>
Date: Sat, 30 Mar 2019 14:50:17 +0900
Subject: [PATCH 21/21] Update pregenerate_training_data.py

randint includes both endpoints, so it can return rand_end itself. When it does, the sentence_index obtained after the modulo makes searchsorted return a sampled_doc_index equal to current_idx, i.e. the current document is sampled again.

example:
cumsum_max = {int64} 30
doc_cumsum = {ndarray} [ 5  7 11 19 30]
doc_lengths = {list} <class 'list'>: [5, 2, 4, 8, 11]
if current_idx  = 1,
rand_start = 7
rand_end = 35
sentence_index = randint(7, 35) % cumsum_max
If randint returns 35, sentence_index becomes 35 % 30 = 5.
If sentence_index is 5, np.searchsorted returns 1, which is equal to current_idx.
---
 examples/lm_finetuning/pregenerate_training_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py
index 498ab22333..8cc28d2e78 100644
--- a/examples/lm_finetuning/pregenerate_training_data.py
+++ b/examples/lm_finetuning/pregenerate_training_data.py
@@ -49,7 +49,7 @@ class DocumentDatabase:
                 self._precalculate_doc_weights()
             rand_start = self.doc_cumsum[current_idx]
             rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
-            sentence_index = randint(rand_start, rand_end) % self.cumsum_max
+            sentence_index = randint(rand_start, rand_end-1) % self.cumsum_max
             sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
         else:
             # If we don't use sentence weighting, then every doc has an equal chance to be chosen