From 88874f6cf09e14fc482abc186adebb2767dca258 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Fri, 8 Mar 2019 19:08:30 +0100
Subject: [PATCH 01/27] BertAdam schedule objects

---
 pytorch_pretrained_bert/optimization.py | 141 +++++++++++++++++-------
 1 file changed, 99 insertions(+), 42 deletions(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index fa911e5c04..73afc71058 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -23,29 +23,99 @@ import logging
 
 logger = logging.getLogger(__name__)
 
-def warmup_cosine(x, warmup=0.002):
-    if x < warmup:
-        return x/warmup
-    return 0.5 * (1.0 + torch.cos(math.pi * x))
 
-def warmup_constant(x, warmup=0.002):
-    """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.
-        Learning rate is 1. afterwards. """
-    if x < warmup:
-        return x/warmup
-    return 1.0
+class LRSchedule(object):
+    warn_t_total = False
+    def __init__(self, warmup=0.002, t_total=-1, **kw):
+        super(LRSchedule, self).__init__(**kw)
+        self.warmup, self.t_total = warmup, t_total
+        if t_total <= 0:
+            logger.warning("t_total value of {} results in schedule not being applied".format(t_total))
+        if not 0.0 <= warmup < 1.0 and not warmup == -1:
+            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
+        self.warned_for_t_total_at_progress = -1
 
-def warmup_linear(x, warmup=0.002):
-    """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step.
-        After `t_total`-th training step, learning rate is zero. """
-    if x < warmup:
-        return x/warmup
-    return max((x-1.)/(warmup-1.), 0)
+    def get_lr(self, step, nowarn=False):
+        progress = step / self.t_total
+        ret = self.get_lr_(progress)
+        # warning for exceeding t_total (only active with warmup_linear
+        if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:
+            logger.warning(
+                "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly."
+                    .format(ret, self.__class__.__name__))
+            self.warned_for_t_total_at_progress = progress
+        # end warning
+        return ret
+
+    def get_lr_(self, step):
+        return 1.
+        # raise NotImplemented("use subclass")
+
+
+class WarmupCosineSchedule(LRSchedule):
+    warn_t_total = True
+    def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
+        super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)
+        self.cycles = cycles
+
+    def get_lr_(self, progress):
+        """ get learning rate multiplier """
+        if self.t_total <= 0:
+            return 1.
+        if progress < self.warmup:
+            return progress / self.warmup
+        else:
+            progress = (progress - self.warmup) / (1 - self.warmup)   # progress after warmup
+            return 0.5 * (1. + torch.cos(math.pi * self.cycles * 2 * progress))
+
+
+class WarmupConstantSchedule(LRSchedule):
+    warn_t_total = False
+    def get_lr_(self, progress):
+        if progress < self.warmup:
+            return progress / self.warmup
+        return 1.
+
+
+class WarmupLinearSchedule(LRSchedule):
+    warn_t_total = True
+    def get_lr_(self, progress):
+        if progress < self.warmup:
+            return progress / self.warmup
+        return max((progress - 1.) / (self.warmup - 1.), 0)
+#
+#
+# def warmup_cosine(x, warmup=0.002):
+#     if x < warmup:
+#         return x/warmup
+#     return 0.5 * (1.0 + torch.cos(math.pi * x))
+#
+# def warmup_constant(x, warmup=0.002):
+#     """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.
+#         Learning rate is 1. afterwards. """
+#     if x < warmup:
+#         return x/warmup
+#     return 1.0
+#
+# def warmup_linear(x, warmup=0.002):
+#     """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step.
+#         After `t_total`-th training step, learning rate is zero. """
+#     if x < warmup:
+#         return x/warmup
+#     return max((x-1.)/(warmup-1.), 0)
+#
+# SCHEDULES = {
+#     'warmup_cosine':   warmup_cosine,
+#     'warmup_constant': warmup_constant,
+#     'warmup_linear':   warmup_linear,
+# }
 
 SCHEDULES = {
-    'warmup_cosine':   warmup_cosine,
-    'warmup_constant': warmup_constant,
-    'warmup_linear':   warmup_linear,
+    None:       LRSchedule,
+    "none":     LRSchedule,
+    "warmup_cosine": WarmupCosineSchedule,
+    "warmup_constant": WarmupConstantSchedule,
+    "warmup_linear": WarmupLinearSchedule
 }
 
 
@@ -70,15 +140,16 @@ class BertAdam(Optimizer):
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
         if schedule not in SCHEDULES:
             raise ValueError("Invalid schedule parameter: {}".format(schedule))
-        if not 0.0 <= warmup < 1.0 and not warmup == -1:
-            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
         if not 0.0 <= b1 < 1.0:
             raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))
         if not 0.0 <= b2 < 1.0:
             raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2))
         if not e >= 0.0:
             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
-        defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total,
+        # initialize schedule object
+        schedule_type = SCHEDULES[schedule]
+        sched = schedule_type(warmup=warmup, t_total=t_total)
+        defaults = dict(lr=lr, schedule=sched,
                         b1=b1, b2=b2, e=e, weight_decay=weight_decay,
                         max_grad_norm=max_grad_norm)
         super(BertAdam, self).__init__(params, defaults)
@@ -90,11 +161,10 @@ class BertAdam(Optimizer):
                 state = self.state[p]
                 if len(state) == 0:
                     return [0]
-                if group['t_total'] != -1:
-                    schedule_fct = SCHEDULES[group['schedule']]
-                    lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup'])
-                else:
-                    lr_scheduled = group['lr']
+
+                lr_scheduled = group['lr']
+                lr_scheduled *= group['schedule'](state['step'])
+
                 lr.append(lr_scheduled)
         return lr
 
@@ -109,8 +179,6 @@ class BertAdam(Optimizer):
         if closure is not None:
             loss = closure()
 
-        warned_for_t_total = False
-
         for group in self.param_groups:
             for p in group['params']:
                 if p.grad is None:
@@ -152,19 +220,8 @@ class BertAdam(Optimizer):
                 if group['weight_decay'] > 0.0:
                     update += group['weight_decay'] * p.data
 
-                if group['t_total'] != -1:
-                    schedule_fct = SCHEDULES[group['schedule']]
-                    progress = state['step']/group['t_total']
-                    lr_scheduled = group['lr'] * schedule_fct(progress, group['warmup'])
-                    # warning for exceeding t_total (only active with warmup_linear
-                    if group['schedule'] == "warmup_linear" and progress > 1. and not warned_for_t_total:
-                        logger.warning(
-                            "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. "
-                            "Please set 't_total' of {} correctly.".format(group['schedule'], lr_scheduled, self.__class__.__name__))
-                        warned_for_t_total = True
-                    # end warning
-                else:
-                    lr_scheduled = group['lr']
+                lr_scheduled = group['lr']
+                lr_scheduled *= group['schedule'](state['step'])
 
                 update_with_lr = lr_scheduled * update
                 p.data.add_(-update_with_lr)

From 90a41dbe1404f734f6a25bfbaf89be71ba5e4613 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Sat, 9 Mar 2019 02:23:20 +0100
Subject: [PATCH 02/27] BertAdam schedule objects

---
 pytorch_pretrained_bert/__init__.py     |  2 +-
 pytorch_pretrained_bert/optimization.py | 48 +++++++++----------------
 2 files changed, 17 insertions(+), 33 deletions(-)

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index bd455b8d9c..e82d409ee0 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -18,7 +18,7 @@ from .modeling_gpt2 import (GPT2Config, GPT2Model,
                             GPT2LMHeadModel, GPT2DoubleHeadsModel,
                             load_tf_weights_in_gpt2)
 
-from .optimization import BertAdam
+from .optimization import *
 from .optimization_openai import OpenAIAdam
 
 from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path
diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index 73afc71058..cea35c39e9 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -24,6 +24,9 @@ import logging
 logger = logging.getLogger(__name__)
 
 
+__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam"]
+
+
 class LRSchedule(object):
     warn_t_total = False
     def __init__(self, warmup=0.002, t_total=-1, **kw):
@@ -83,32 +86,7 @@ class WarmupLinearSchedule(LRSchedule):
         if progress < self.warmup:
             return progress / self.warmup
         return max((progress - 1.) / (self.warmup - 1.), 0)
-#
-#
-# def warmup_cosine(x, warmup=0.002):
-#     if x < warmup:
-#         return x/warmup
-#     return 0.5 * (1.0 + torch.cos(math.pi * x))
-#
-# def warmup_constant(x, warmup=0.002):
-#     """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.
-#         Learning rate is 1. afterwards. """
-#     if x < warmup:
-#         return x/warmup
-#     return 1.0
-#
-# def warmup_linear(x, warmup=0.002):
-#     """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step.
-#         After `t_total`-th training step, learning rate is zero. """
-#     if x < warmup:
-#         return x/warmup
-#     return max((x-1.)/(warmup-1.), 0)
-#
-# SCHEDULES = {
-#     'warmup_cosine':   warmup_cosine,
-#     'warmup_constant': warmup_constant,
-#     'warmup_linear':   warmup_linear,
-# }
+
 
 SCHEDULES = {
     None:       LRSchedule,
@@ -126,7 +104,9 @@ class BertAdam(Optimizer):
         warmup: portion of t_total for the warmup, -1  means no warmup. Default: -1
         t_total: total number of training steps for the learning
             rate schedule, -1  means constant learning rate. Default: -1
-        schedule: schedule to use for the warmup (see above). Default: 'warmup_linear'
+        schedule: schedule to use for the warmup (see above).
+            Can be 'warmup_linear', 'warmup_constant', 'warmup_cosine', or a LRSchedule object.
+            Default: 'warmup_linear'
         b1: Adams b1. Default: 0.9
         b2: Adams b2. Default: 0.999
         e: Adams epsilon. Default: 1e-6
@@ -147,9 +127,13 @@ class BertAdam(Optimizer):
         if not e >= 0.0:
             raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
         # initialize schedule object
-        schedule_type = SCHEDULES[schedule]
-        sched = schedule_type(warmup=warmup, t_total=t_total)
-        defaults = dict(lr=lr, schedule=sched,
+        if not isinstance(schedule, LRSchedule):
+            schedule_type = SCHEDULES[schedule]
+            schedule = schedule_type(warmup=warmup, t_total=t_total)
+        else:
+            if warmup != -1 or t_total != -1:
+                logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided.")
+        defaults = dict(lr=lr, schedule=schedule,
                         b1=b1, b2=b2, e=e, weight_decay=weight_decay,
                         max_grad_norm=max_grad_norm)
         super(BertAdam, self).__init__(params, defaults)
@@ -163,7 +147,7 @@ class BertAdam(Optimizer):
                     return [0]
 
                 lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'](state['step'])
+                lr_scheduled *= group['schedule'].get_lr(state['step'])
 
                 lr.append(lr_scheduled)
         return lr
@@ -221,7 +205,7 @@ class BertAdam(Optimizer):
                     update += group['weight_decay'] * p.data
 
                 lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'](state['step'])
+                lr_scheduled *= group['schedule'].get_lr(state['step'])
 
                 update_with_lr = lr_scheduled * update
                 p.data.add_(-update_with_lr)

From f113a2dfdcf3116a35f856b274b4e4c2ecbeb6c0 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Sat, 9 Mar 2019 02:29:57 +0100
Subject: [PATCH 03/27] BertAdam: accept LRSchedule objects in schedule check

---
 pytorch_pretrained_bert/optimization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index cea35c39e9..84f329feae 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -118,7 +118,7 @@ class BertAdam(Optimizer):
                  max_grad_norm=1.0):
         if lr is not required and lr < 0.0:
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
-        if schedule not in SCHEDULES:
+        if not isinstance(schedule, LRSchedule) and schedule not in SCHEDULES:
             raise ValueError("Invalid schedule parameter: {}".format(schedule))
         if not 0.0 <= b1 < 1.0:
             raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1))

From 51efde54a907c85495263a980d253dcdb3e75209 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Sat, 9 Mar 2019 02:45:25 +0100
Subject: [PATCH 04/27] cos fix

---
 pytorch_pretrained_bert/optimization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index 84f329feae..a92adb4c56 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -69,7 +69,7 @@ class WarmupCosineSchedule(LRSchedule):
             return progress / self.warmup
         else:
             progress = (progress - self.warmup) / (1 - self.warmup)   # progress after warmup
-            return 0.5 * (1. + torch.cos(math.pi * self.cycles * 2 * progress))
+            return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
 
 
 class WarmupConstantSchedule(LRSchedule):

From baf66d141958785feb0dfc90d6cd8558eb95a774 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Tue, 12 Mar 2019 13:22:23 +0100
Subject: [PATCH 05/27] restart cosine lr schedule

---
 pytorch_pretrained_bert/optimization.py | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index a92adb4c56..58e16f01a6 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -69,7 +69,23 @@ class WarmupCosineSchedule(LRSchedule):
             return progress / self.warmup
         else:
             progress = (progress - self.warmup) / (1 - self.warmup)   # progress after warmup
-            return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
+            return 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1))
+
+
+class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule):
+    warn_t_total = True
+    def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
+        super(WarmupCosineWithRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
+
+    def get_lr_(self, progress):
+        if self.t_total <= 0:
+            return 1.
+        if progress < self.warmup:
+            return progress / self.warmup
+        else:
+            progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup
+            ret = 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
+            return ret
 
 
 class WarmupConstantSchedule(LRSchedule):

From 902461333715d16773cac1e5e1300be705f49205 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Tue, 12 Mar 2019 13:23:58 +0100
Subject: [PATCH 06/27] move cosine restart (% 1) into WarmupCosineWithRestartsSchedule

---
 pytorch_pretrained_bert/optimization.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index 58e16f01a6..481072c483 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -69,7 +69,7 @@ class WarmupCosineSchedule(LRSchedule):
             return progress / self.warmup
         else:
             progress = (progress - self.warmup) / (1 - self.warmup)   # progress after warmup
-            return 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1))
+            return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
 
 
 class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule):
@@ -84,7 +84,7 @@ class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule):
             return progress / self.warmup
         else:
             progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup
-            ret = 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
+            ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1)))
             return ret
 
 

From 471daf1b6c0821e8b5ab6a173d7f41de079bae8a Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Tue, 12 Mar 2019 13:32:42 +0100
Subject: [PATCH 07/27] WarmupCosineWithRestartsSchedule: drop factor 2 from cycle phase

---
 pytorch_pretrained_bert/optimization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index 481072c483..dcd8cfff2f 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -84,7 +84,7 @@ class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule):
             return progress / self.warmup
         else:
             progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup
-            ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1)))
+            ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1)))
             return ret
 
 

From eac039d21f8141c501bd75d02d46ba5545797b63 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Tue, 12 Mar 2019 13:45:12 +0100
Subject: [PATCH 08/27] export WarmupCosineWithRestartsSchedule in __all__

---
 pytorch_pretrained_bert/optimization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index dcd8cfff2f..7eda3ba92a 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -24,7 +24,7 @@ import logging
 logger = logging.getLogger(__name__)
 
 
-__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam"]
+__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam", "WarmupCosineWithRestartsSchedule"]
 
 
 class LRSchedule(object):

From 20e652209c7da7a73c9d1f3a65418d0ea118680e Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Wed, 13 Mar 2019 16:13:37 +0100
Subject: [PATCH 09/27] BertAdam: add init_weight_decay parameter (not yet used)

---
 pytorch_pretrained_bert/optimization.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index 7eda3ba92a..9a873e221b 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -130,7 +130,7 @@ class BertAdam(Optimizer):
         max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
     """
     def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
-                 b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01,
+                 b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, init_weight_decay=0.,
                  max_grad_norm=1.0):
         if lr is not required and lr < 0.0:
             raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
@@ -150,7 +150,7 @@ class BertAdam(Optimizer):
             if warmup != -1 or t_total != -1:
                 logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided.")
         defaults = dict(lr=lr, schedule=schedule,
-                        b1=b1, b2=b2, e=e, weight_decay=weight_decay,
+                        b1=b1, b2=b2, e=e, weight_decay=weight_decay, init_weight_decay=init_weight_decay,
                         max_grad_norm=max_grad_norm)
         super(BertAdam, self).__init__(params, defaults)
 
@@ -220,6 +220,8 @@ class BertAdam(Optimizer):
                 if group['weight_decay'] > 0.0:
                     update += group['weight_decay'] * p.data
 
+                # TODO: init weight decay
+
                 lr_scheduled = group['lr']
                 lr_scheduled *= group['schedule'].get_lr(state['step'])
 

From bed6408dcce8cf1b04e8dfa41f481500f40e47ca Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Mon, 18 Mar 2019 13:09:55 +0100
Subject: [PATCH 10/27] branches, optim cosine fix

---
 pytorch_pretrained_bert/optimization.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index fa911e5c04..e553365b54 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -26,7 +26,9 @@ logger = logging.getLogger(__name__)
 def warmup_cosine(x, warmup=0.002):
     if x < warmup:
         return x/warmup
-    return 0.5 * (1.0 + torch.cos(math.pi * x))
+
+    x_ = (x - warmup) / (1 - warmup)  # progress after warmup
+    return 0.5 * (1. + math.cos(math.pi * x_))
 
 def warmup_constant(x, warmup=0.002):
     """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.

From ef28b2c74739162b88d78009ed3fae74deeb8b36 Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Mon, 18 Mar 2019 13:18:07 +0100
Subject: [PATCH 11/27] branches, optim cosine fix

---
 pytorch_pretrained_bert/optimization.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index 95411857c6..aa59c7d7ec 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
 def warmup_cosine(x, warmup=0.002):
     if x < warmup:
         return x/warmup
-    x_ = (x - warmup) / (1 - warmup)  # progress after warmup
+    x_ = (x - warmup) / (1 - warmup)  # progress after warmup -
     return 0.5 * (1. + math.cos(math.pi * x_))
 
 def warmup_constant(x, warmup=0.002):

From 2283dcca5e62438efbe445d518cddfaa2842a75b Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Mon, 18 Mar 2019 13:40:12 +0100
Subject: [PATCH 12/27] import revert

---
 pytorch_pretrained_bert/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py
index e82d409ee0..bd455b8d9c 100644
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -18,7 +18,7 @@ from .modeling_gpt2 import (GPT2Config, GPT2Model,
                             GPT2LMHeadModel, GPT2DoubleHeadsModel,
                             load_tf_weights_in_gpt2)
 
-from .optimization import *
+from .optimization import BertAdam
 from .optimization_openai import OpenAIAdam
 
 from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path

From 19cc2c084e265d1cf2ae2218147cd52bdd71bf1a Mon Sep 17 00:00:00 2001
From: lukovnikov <lukovnikov@outlook.com>
Date: Mon, 18 Mar 2019 15:13:35 +0100
Subject: [PATCH 13/27] apply warmup_cosine fix to optimization_openai.py

---
 pytorch_pretrained_bert/optimization_openai.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py
index 7f56a1284c..99ac15e108 100644
--- a/pytorch_pretrained_bert/optimization_openai.py
+++ b/pytorch_pretrained_bert/optimization_openai.py
@@ -26,7 +26,8 @@ logger = logging.getLogger(__name__)
 def warmup_cosine(x, warmup=0.002):
     if x < warmup:
         return x/warmup
-    return 0.5 * (1.0 + torch.cos(math.pi * x))
+    x_ = (x - warmup) / (1 - warmup)  # progress after warmup
+    return 0.5 * (1. + math.cos(math.pi * x_))
 
 def warmup_constant(x, warmup=0.002):
     """ Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps.

From 7797d21b8d7180af2114890c4c3393c762a69154 Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Sun, 24 Mar 2019 13:34:30 -0700
Subject: [PATCH 14/27] Fix GPT2 language modeling loss computation

---
 pytorch_pretrained_bert/modeling_gpt2.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index c381b288f8..13ae7a2342 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -617,8 +617,16 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
         lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = lm_logits[:, :-1]
+            shift_labels = torch_batch[:, 1:]
+
+            # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes]
+            # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
+            # We just flatten the tokens out this way.
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))
+                            shift_labels.view(-1))
             return loss
         return lm_logits, presents
 

From 5938f31fa7aa28cdff662f79c7c038cab21bb370 Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Sun, 24 Mar 2019 13:35:32 -0700
Subject: [PATCH 15/27] Fix c/p typo from my experiment code

---
 pytorch_pretrained_bert/modeling_gpt2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 13ae7a2342..1733a5b3f4 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -619,7 +619,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         if lm_labels is not None:
             # Shift so that tokens < n predict n
             shift_logits = lm_logits[:, :-1]
-            shift_labels = torch_batch[:, 1:]
+            shift_labels = lm_labels[:, 1:]
 
             # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes]
             # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]

From 2e6f5ffb96029398f740b6eacdc86b117cccb86b Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Sun, 24 Mar 2019 13:36:46 -0700
Subject: [PATCH 16/27] Fix GPT language model loss here as well

---
 pytorch_pretrained_bert/modeling_openai.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 296abbfc31..9c708f88a2 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -716,8 +716,16 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
         lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = lm_logits[:, :-1]
+            shift_labels = lm_labels[:, 1:]
+
+            # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes]
+            # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
+            # We just flatten the tokens out this way.
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))
+                            shift_labels.view(-1))
             return loss
         return lm_logits
 

From 472857c47f3b6a142a7aaa53836e33cd8543088d Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Sun, 24 Mar 2019 13:49:42 -0700
Subject: [PATCH 17/27] Fix typo syntax err (sorry, c/p from my repo)

---
 pytorch_pretrained_bert/modeling_gpt2.py   | 2 +-
 pytorch_pretrained_bert/modeling_openai.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 1733a5b3f4..15e7ca26e1 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -625,7 +625,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
             # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
             # We just flatten the tokens out this way.
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))
             return loss
         return lm_logits, presents
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 9c708f88a2..ab4107667b 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -724,7 +724,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
             # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
             # We just flatten the tokens out this way.
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))
             return loss
         return lm_logits

From 0dd796e359d1fbf9c0ea39b04e9b5655e5a09dee Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Sun, 24 Mar 2019 14:35:55 -0700
Subject: [PATCH 18/27] Also fix loss function issue with the double head
 models

---
 pytorch_pretrained_bert/modeling_gpt2.py   | 5 ++++-
 pytorch_pretrained_bert/modeling_openai.py | 5 ++++-
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 15e7ca26e1..635326b408 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -698,8 +698,11 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
         losses = []
         if lm_labels is not None:
+            shift_logits = lm_logits[:, :-1]
+            shift_labels = lm_labels[:, 1:]
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
+            losses.append(loss_fct(shift_logits.view(-1,
+                          shift_logits.size(-1)), shift_labels.view(-1)))
         if mc_labels is not None:
             loss_fct = CrossEntropyLoss()
             losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index ab4107667b..8c1dd5e4a3 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -811,8 +811,11 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
         losses = []
         if lm_labels is not None:
+            shift_logits = lm_logits[:, :-1]
+            shift_labels = lm_labels[:, 1:]
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
+            losses.append(loss_fct(shift_logits.view(-1,
+                          shift_logits.size(-1)), shift_labels.view(-1)))
         if mc_labels is not None:
             loss_fct = CrossEntropyLoss()
             losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))

From fda2f623953bfe2290cd65429eb008f02ebdb152 Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Sun, 24 Mar 2019 14:37:13 -0700
Subject: [PATCH 19/27] Fix test failures due to old torch issue with
 non-contiguous view

---
 pytorch_pretrained_bert/modeling_gpt2.py   | 8 ++++----
 pytorch_pretrained_bert/modeling_openai.py | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 635326b408..7a0bb4db53 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -618,8 +618,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
             # Shift so that tokens < n predict n
-            shift_logits = lm_logits[:, :-1]
-            shift_labels = lm_labels[:, 1:]
+            shift_logits = lm_logits[:, :-1].contiguous()
+            shift_labels = lm_labels[:, 1:].contiguous()
 
             # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes]
             # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
@@ -698,8 +698,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
         losses = []
         if lm_labels is not None:
-            shift_logits = lm_logits[:, :-1]
-            shift_labels = lm_labels[:, 1:]
+            shift_logits = lm_logits[:, :-1].contiguous()
+            shift_labels = lm_labels[:, 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             losses.append(loss_fct(shift_logits.view(-1,
                           shift_logits.size(-1)), shift_labels.view(-1)))
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 8c1dd5e4a3..4385c1eaa7 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -717,8 +717,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
             # Shift so that tokens < n predict n
-            shift_logits = lm_logits[:, :-1]
-            shift_labels = lm_labels[:, 1:]
+            shift_logits = lm_logits[:, :-1].contiguous()
+            shift_labels = lm_labels[:, 1:].contiguous()
 
             # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes]
             # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
@@ -811,8 +811,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
         losses = []
         if lm_labels is not None:
-            shift_logits = lm_logits[:, :-1]
-            shift_labels = lm_labels[:, 1:]
+            shift_logits = lm_logits[:, :-1].contiguous()
+            shift_labels = lm_labels[:, 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             losses.append(loss_fct(shift_logits.view(-1,
                           shift_logits.size(-1)), shift_labels.view(-1)))

From 01520d5412ab1b17c4ef0da5ed6cb9e62d6dfcb1 Mon Sep 17 00:00:00 2001
From: Catalin Voss <catalin@cs.stanford.edu>
Date: Wed, 27 Mar 2019 10:45:11 -0700
Subject: [PATCH 20/27] Remove my unhelpful comments :)

---
 pytorch_pretrained_bert/modeling_gpt2.py   | 4 +---
 pytorch_pretrained_bert/modeling_openai.py | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index 7a0bb4db53..7b00ce7730 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -621,9 +621,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
             shift_logits = lm_logits[:, :-1].contiguous()
             shift_labels = lm_labels[:, 1:].contiguous()
 
-            # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes]
-            # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
-            # We just flatten the tokens out this way.
+            # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 4385c1eaa7..7273e75bf6 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -720,9 +720,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
             shift_logits = lm_logits[:, :-1].contiguous()
             shift_labels = lm_labels[:, 1:].contiguous()
 
-            # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes]
-            # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}]
-            # We just flatten the tokens out this way.
+            # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))

From 60005f464d2069801a2cf26dc0f011da8ed639b0 Mon Sep 17 00:00:00 2001
From: jeonsworld <37530102+jeonsworld@users.noreply.github.com>
Date: Sat, 30 Mar 2019 14:50:17 +0900
Subject: [PATCH 21/27] Update pregenerate_training_data.py

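The upper bound passed to randint is inclusive, so randint can return rand_end itself. When it does, the modulo wraps sentence_index back into the current document, and np.searchsorted returns a sampled_doc_index equal to current_idx, i.e. the current document is sampled as its own random document. Using rand_end-1 as the upper bound excludes that case.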

example:
cumsum_max = {int64} 30
doc_cumsum = {ndarray} [ 5  7 11 19 30]
doc_lengths = {list} <class 'list'>: [5, 2, 4, 8, 11]
if current_idx  = 1,
rand_start = 7
rand_end = 35
sentence_index = randint(7, 35) % cumsum_max
If randint returns 35, sentence_index becomes 35 % 30 = 5.
If sentence_index is 5, np.searchsorted returns 1, which equals current_idx.
---
 examples/lm_finetuning/pregenerate_training_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py
index 498ab22333..8cc28d2e78 100644
--- a/examples/lm_finetuning/pregenerate_training_data.py
+++ b/examples/lm_finetuning/pregenerate_training_data.py
@@ -49,7 +49,7 @@ class DocumentDatabase:
                 self._precalculate_doc_weights()
             rand_start = self.doc_cumsum[current_idx]
             rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
-            sentence_index = randint(rand_start, rand_end) % self.cumsum_max
+            sentence_index = randint(rand_start, rand_end-1) % self.cumsum_max
             sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
         else:
             # If we don't use sentence weighting, then every doc has an equal chance to be chosen

From d07db28f52b4faf6769d6b7b454c950ec02dc641 Mon Sep 17 00:00:00 2001
From: Weixin Wang <6220861+MottoX@users.noreply.github.com>
Date: Sun, 31 Mar 2019 01:20:18 +0800
Subject: [PATCH 22/27] Fix typo in example code

Modify 'unambigiously' to 'unambiguously'
---
 examples/run_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_classifier.py b/examples/run_classifier.py
index d79c2526b0..751d581ad9 100644
--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -442,7 +442,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
         # sequence or the second sequence. The embedding vectors for `type=0` and
         # `type=1` were learned during pre-training and are added to the wordpiece
         # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambigiously separates the sequences, but it makes
+        # since the [SEP] token unambiguously separates the sequences, but it makes
         # it easier for the model to learn the concept of sequences.
         #
         # For classification tasks, the first vector (corresponding to [CLS]) is

From 8b5c63e4deffad8c1c421caee8fef4bb97881f70 Mon Sep 17 00:00:00 2001
From: Mike Arpaia <mike.arpaia@workday.com>
Date: Mon, 1 Apr 2019 12:53:51 -0600
Subject: [PATCH 23/27] Fixes to the TensorFlow conversion tool

---
 examples/extract_features.py        | 2 +-
 pytorch_pretrained_bert/modeling.py | 8 ++++++--
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/examples/extract_features.py b/examples/extract_features.py
index 0d59aa7e81..13384a9d69 100644
--- a/examples/extract_features.py
+++ b/examples/extract_features.py
@@ -57,7 +57,7 @@ class InputFeatures(object):
 
 
 def convert_examples_to_features(examples, seq_length, tokenizer):
-    """Loads a data file into a list of `InputBatch`s."""
+    """Loads a data file into a list of `InputFeature`s."""
 
     features = []
     for (ex_index, example) in enumerate(examples):
diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index b92f3a87f1..938636142f 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -76,7 +76,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
         name = name.split('/')
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
-        if any(n in ["adam_v", "adam_m"] for n in name):
+        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
             print("Skipping {}".format("/".join(name)))
             continue
         pointer = model
@@ -92,7 +92,11 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
             elif l[0] == 'output_weights':
                 pointer = getattr(pointer, 'weight')
             else:
-                pointer = getattr(pointer, l[0])
+                try:
+                    pointer = getattr(pointer, l[0])
+                except AttributeError:
+                    print("Skipping {}".format("/".join(name)))
+                    continue
             if len(l) >= 2:
                 num = int(l[1])
                 pointer = pointer[num]

From f26ce6992e683a49799515e86086cf66599e2762 Mon Sep 17 00:00:00 2001
From: Weixin Wang <6220861+MottoX@users.noreply.github.com>
Date: Tue, 2 Apr 2019 17:20:32 +0800
Subject: [PATCH 24/27] Fix links in README

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 6e9589aaa2..daac69de9f 100644
--- a/README.md
+++ b/README.md
@@ -1234,9 +1234,9 @@ A command-line interface is provided to convert a TensorFlow checkpoint in a PyT
 
 ### BERT
 
-You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py`](convert_tf_checkpoint_to_pytorch.py) script.
+You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`convert_tf_checkpoint_to_pytorch.py`](./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py ) script.
 
-This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in [`extract_features.py`](./examples/extract_features.py), [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`]((./examples/run_squad.py))).
+This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in [`extract_features.py`](./examples/extract_features.py), [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`](./examples/run_squad.py)).
 
 You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file (`bert_config.json`) and the vocabulary file (`vocab.txt`) as these are needed for the PyTorch model too.
 

From 846b1fd6f81a40ca363a91afaf3f67c3098e5687 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 3 Apr 2019 10:50:38 +0200
Subject: [PATCH 25/27] Fix #419

---
 examples/run_squad.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index bb335ce75c..043b795326 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -85,9 +85,9 @@ class SquadExample(object):
         s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
         if self.start_position:
             s += ", start_position: %d" % (self.start_position)
-        if self.start_position:
+        if self.end_position:
             s += ", end_position: %d" % (self.end_position)
-        if self.start_position:
+        if self.is_impossible:
             s += ", is_impossible: %r" % (self.is_impossible)
         return s
 

From 1d8c23232444fbeae395a892fe4f353138e81fec Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 3 Apr 2019 10:51:03 +0200
Subject: [PATCH 26/27] Fix #436

---
 pytorch_pretrained_bert/tokenization.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py
index c549e06d78..bbb3e25fc7 100644
--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -105,13 +105,13 @@ class BertTokenizer(object):
         self.max_len = max_len if max_len is not None else int(1e12)
 
     def tokenize(self, text):
+        split_tokens = []
         if self.do_basic_tokenize:
-          split_tokens = []
-          for token in self.basic_tokenizer.tokenize(text):
-              for sub_token in self.wordpiece_tokenizer.tokenize(token):
-                  split_tokens.append(sub_token)
+            for token in self.basic_tokenizer.tokenize(text):
+                for sub_token in self.wordpiece_tokenizer.tokenize(token):
+                    split_tokens.append(sub_token)
         else:
-          split_tokens = self.wordpiece_tokenizer.tokenize(text)
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)
         return split_tokens
 
     def convert_tokens_to_ids(self, tokens):
@@ -142,6 +142,16 @@ class BertTokenizer(object):
         """
         if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
             vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
+            if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
+                logger.warning("The pre-trained model you are loading is a cased model but you have not set "
+                               "`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
+                               "you may want to check this behavior.")
+                kwargs['do_lower_case'] = False
+            elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
+                logger.warning("The pre-trained model you are loading is an uncased model but you have set "
+                               "`do_lower_case` to False. We are setting `do_lower_case=True` for you "
+                               "but you may want to check this behavior.")
+                kwargs['do_lower_case'] = True
         else:
             vocab_file = pretrained_model_name_or_path
         if os.path.isdir(vocab_file):

From 19666dcb3bee3e379f1458e295869957aac8590c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 3 Apr 2019 11:01:01 +0200
Subject: [PATCH 27/27] Should fix #438

---
 pytorch_pretrained_bert/modeling.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py
index 938636142f..2736e34d7f 100644
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
@@ -91,6 +91,8 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path):
                 pointer = getattr(pointer, 'bias')
             elif l[0] == 'output_weights':
                 pointer = getattr(pointer, 'weight')
+            elif l[0] == 'squad':
+                pointer = getattr(pointer, 'classifier')
             else:
                 try:
                     pointer = getattr(pointer, l[0])