From 88874f6cf09e14fc482abc186adebb2767dca258 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Fri, 8 Mar 2019 19:08:30 +0100 Subject: [PATCH 01/21] BertAdam schedule objects --- pytorch_pretrained_bert/optimization.py | 141 +++++++++++++++++------- 1 file changed, 99 insertions(+), 42 deletions(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index fa911e5c04..73afc71058 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -23,29 +23,99 @@ import logging logger = logging.getLogger(__name__) -def warmup_cosine(x, warmup=0.002): - if x < warmup: - return x/warmup - return 0.5 * (1.0 + torch.cos(math.pi * x)) -def warmup_constant(x, warmup=0.002): - """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. - Learning rate is 1. afterwards. """ - if x < warmup: - return x/warmup - return 1.0 +class LRSchedule(object): + warn_t_total = False + def __init__(self, warmup=0.002, t_total=-1, **kw): + super(LRSchedule, self).__init__(**kw) + self.warmup, self.t_total = warmup, t_total + if t_total <= 0: + logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) + if not 0.0 <= warmup < 1.0 and not warmup == -1: + raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + self.warned_for_t_total_at_progress = -1 -def warmup_linear(x, warmup=0.002): - """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step. - After `t_total`-th training step, learning rate is zero. """ - if x < warmup: - return x/warmup - return max((x-1.)/(warmup-1.), 0) + def get_lr(self, step, nowarn=False): + progress = step / self.t_total + ret = self.get_lr_(progress) + # warning for exceeding t_total (only active with warmup_linear + if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress: + logger.warning( + "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly." + .format(ret, self.__class__.__name__)) + self.warned_for_t_total_at_progress = progress + # end warning + return ret + + def get_lr_(self, step): + return 1. + # raise NotImplemented("use subclass") + + +class WarmupCosineSchedule(LRSchedule): + warn_t_total = True + def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): + super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw) + self.cycles = cycles + + def get_lr_(self, progress): + """ get learning rate multiplier """ + if self.t_total <= 0: + return 1. + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + return 0.5 * (1. + torch.cos(math.pi * self.cycles * 2 * progress)) + + +class WarmupConstantSchedule(LRSchedule): + warn_t_total = False + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return 1. + + +class WarmupLinearSchedule(LRSchedule): + warn_t_total = True + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return max((progress - 1.) / (self.warmup - 1.), 0) +# +# +# def warmup_cosine(x, warmup=0.002): +# if x < warmup: +# return x/warmup +# return 0.5 * (1.0 + torch.cos(math.pi * x)) +# +# def warmup_constant(x, warmup=0.002): +# """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. +# Learning rate is 1. afterwards. """ +# if x < warmup: +# return x/warmup +# return 1.0 +# +# def warmup_linear(x, warmup=0.002): +# """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step. +# After `t_total`-th training step, learning rate is zero. """ +# if x < warmup: +# return x/warmup +# return max((x-1.)/(warmup-1.), 0) +# +# SCHEDULES = { +# 'warmup_cosine': warmup_cosine, +# 'warmup_constant': warmup_constant, +# 'warmup_linear': warmup_linear, +# } SCHEDULES = { - 'warmup_cosine': warmup_cosine, - 'warmup_constant': warmup_constant, - 'warmup_linear': warmup_linear, + None: LRSchedule, + "none": LRSchedule, + "warmup_cosine": WarmupCosineSchedule, + "warmup_constant": WarmupConstantSchedule, + "warmup_linear": WarmupLinearSchedule } @@ -70,15 +140,16 @@ class BertAdam(Optimizer): raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) if schedule not in SCHEDULES: raise ValueError("Invalid schedule parameter: {}".format(schedule)) - if not 0.0 <= warmup < 1.0 and not warmup == -1: - raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) if not 0.0 <= b1 < 1.0: raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) if not 0.0 <= b2 < 1.0: raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) if not e >= 0.0: raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) - defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, + # initialize schedule object + schedule_type = SCHEDULES[schedule] + sched = schedule_type(warmup=warmup, t_total=t_total) + defaults = dict(lr=lr, schedule=sched, b1=b1, b2=b2, e=e, weight_decay=weight_decay, max_grad_norm=max_grad_norm) super(BertAdam, self).__init__(params, defaults) @@ -90,11 +161,10 @@ class BertAdam(Optimizer): state = self.state[p] if len(state) == 0: return [0] - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) - else: - lr_scheduled = group['lr'] + + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'](state['step']) + lr.append(lr_scheduled) return lr @@ -109,8 +179,6 @@ class BertAdam(Optimizer): if closure is not None: loss = closure() - warned_for_t_total = False - for group in self.param_groups: for p in group['params']: if p.grad is None: @@ -152,19 +220,8 @@ class BertAdam(Optimizer): if group['weight_decay'] > 0.0: update += group['weight_decay'] * p.data - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - progress = state['step']/group['t_total'] - lr_scheduled = group['lr'] * schedule_fct(progress, group['warmup']) - # warning for exceeding t_total (only active with warmup_linear - if group['schedule'] == "warmup_linear" and progress > 1. and not warned_for_t_total: - logger.warning( - "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. " - "Please set 't_total' of {} correctly.".format(group['schedule'], lr_scheduled, self.__class__.__name__)) - warned_for_t_total = True - # end warning - else: - lr_scheduled = group['lr'] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'](state['step']) update_with_lr = lr_scheduled * update p.data.add_(-update_with_lr) From 90a41dbe1404f734f6a25bfbaf89be71ba5e4613 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Sat, 9 Mar 2019 02:23:20 +0100 Subject: [PATCH 02/21] BertAdam schedule objects --- pytorch_pretrained_bert/__init__.py | 2 +- pytorch_pretrained_bert/optimization.py | 48 +++++++++---------------- 2 files changed, 17 insertions(+), 33 deletions(-) diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py index bd455b8d9c..e82d409ee0 100644 --- a/pytorch_pretrained_bert/__init__.py +++ b/pytorch_pretrained_bert/__init__.py @@ -18,7 +18,7 @@ from .modeling_gpt2 import (GPT2Config, GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, load_tf_weights_in_gpt2) -from .optimization import BertAdam +from .optimization import * from .optimization_openai import OpenAIAdam from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 73afc71058..cea35c39e9 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -24,6 +24,9 @@ import logging logger = logging.getLogger(__name__) +__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam"] + + class LRSchedule(object): warn_t_total = False def __init__(self, warmup=0.002, t_total=-1, **kw): @@ -83,32 +86,7 @@ class WarmupLinearSchedule(LRSchedule): if progress < self.warmup: return progress / self.warmup return max((progress - 1.) / (self.warmup - 1.), 0) -# -# -# def warmup_cosine(x, warmup=0.002): -# if x < warmup: -# return x/warmup -# return 0.5 * (1.0 + torch.cos(math.pi * x)) -# -# def warmup_constant(x, warmup=0.002): -# """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. -# Learning rate is 1. afterwards. """ -# if x < warmup: -# return x/warmup -# return 1.0 -# -# def warmup_linear(x, warmup=0.002): -# """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step. -# After `t_total`-th training step, learning rate is zero. """ -# if x < warmup: -# return x/warmup -# return max((x-1.)/(warmup-1.), 0) -# -# SCHEDULES = { -# 'warmup_cosine': warmup_cosine, -# 'warmup_constant': warmup_constant, -# 'warmup_linear': warmup_linear, -# } + SCHEDULES = { None: LRSchedule, @@ -126,7 +104,9 @@ class BertAdam(Optimizer): warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 t_total: total number of training steps for the learning rate schedule, -1 means constant learning rate. Default: -1 - schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' + schedule: schedule to use for the warmup (see above). + Can be 'warmup_linear', 'warmup_constant', 'warmup_cosine', or a LRSchedule object. + Default: 'warmup_linear' b1: Adams b1. Default: 0.9 b2: Adams b2. Default: 0.999 e: Adams epsilon. Default: 1e-6 @@ -147,9 +127,13 @@ class BertAdam(Optimizer): if not e >= 0.0: raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) # initialize schedule object - schedule_type = SCHEDULES[schedule] - sched = schedule_type(warmup=warmup, t_total=t_total) - defaults = dict(lr=lr, schedule=sched, + if not isinstance(schedule, LRSchedule): + schedule_type = SCHEDULES[schedule] + schedule = schedule_type(warmup=warmup, t_total=t_total) + else: + if warmup != -1 or t_total != -1: + logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided.") + defaults = dict(lr=lr, schedule=schedule, b1=b1, b2=b2, e=e, weight_decay=weight_decay, max_grad_norm=max_grad_norm) super(BertAdam, self).__init__(params, defaults) @@ -163,7 +147,7 @@ class BertAdam(Optimizer): return [0] lr_scheduled = group['lr'] - lr_scheduled *= group['schedule'](state['step']) + lr_scheduled *= group['schedule'].get_lr(state['step']) lr.append(lr_scheduled) return lr @@ -221,7 +205,7 @@ class BertAdam(Optimizer): update += group['weight_decay'] * p.data lr_scheduled = group['lr'] - lr_scheduled *= group['schedule'](state['step']) + lr_scheduled *= group['schedule'].get_lr(state['step']) update_with_lr = lr_scheduled * update p.data.add_(-update_with_lr) From f113a2dfdcf3116a35f856b274b4e4c2ecbeb6c0 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Sat, 9 Mar 2019 02:29:57 +0100 Subject: [PATCH 03/21] readme de --- pytorch_pretrained_bert/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index cea35c39e9..84f329feae 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -118,7 +118,7 @@ class BertAdam(Optimizer): max_grad_norm=1.0): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) - if schedule not in SCHEDULES: + if not isinstance(schedule, LRSchedule) and schedule not in SCHEDULES: raise ValueError("Invalid schedule parameter: {}".format(schedule)) if not 0.0 <= b1 < 1.0: raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) From 51efde54a907c85495263a980d253dcdb3e75209 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Sat, 9 Mar 2019 02:45:25 +0100 Subject: [PATCH 04/21] cos fix --- pytorch_pretrained_bert/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 84f329feae..a92adb4c56 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -69,7 +69,7 @@ class WarmupCosineSchedule(LRSchedule): return progress / self.warmup else: progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup - return 0.5 * (1. + torch.cos(math.pi * self.cycles * 2 * progress)) + return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) class WarmupConstantSchedule(LRSchedule): From baf66d141958785feb0dfc90d6cd8558eb95a774 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Tue, 12 Mar 2019 13:22:23 +0100 Subject: [PATCH 05/21] restart cosine lr schedule --- pytorch_pretrained_bert/optimization.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index a92adb4c56..58e16f01a6 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -69,7 +69,23 @@ class WarmupCosineSchedule(LRSchedule): return progress / self.warmup else: progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup - return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) + return 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1)) + + +class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule): + warn_t_total = True + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + super(WarmupCosineWithRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + + def get_lr_(self, progress): + if self.t_total <= 0: + return 1. + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) + return ret class WarmupConstantSchedule(LRSchedule): From 902461333715d16773cac1e5e1300be705f49205 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Tue, 12 Mar 2019 13:23:58 +0100 Subject: [PATCH 06/21] changing docker --- pytorch_pretrained_bert/optimization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 58e16f01a6..481072c483 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -69,7 +69,7 @@ class WarmupCosineSchedule(LRSchedule): return progress / self.warmup else: progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup - return 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1)) + return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule): @@ -84,7 +84,7 @@ class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule): return progress / self.warmup else: progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup - ret = 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) + ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1))) return ret From 471daf1b6c0821e8b5ab6a173d7f41de079bae8a Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Tue, 12 Mar 2019 13:32:42 +0100 Subject: [PATCH 07/21] changing docker --- pytorch_pretrained_bert/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 481072c483..dcd8cfff2f 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -84,7 +84,7 @@ class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule): return progress / self.warmup else: progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup - ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1))) + ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1))) return ret From eac039d21f8141c501bd75d02d46ba5545797b63 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Tue, 12 Mar 2019 13:45:12 +0100 Subject: [PATCH 08/21] changing docker --- pytorch_pretrained_bert/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index dcd8cfff2f..7eda3ba92a 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -24,7 +24,7 @@ import logging logger = logging.getLogger(__name__) -__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam"] +__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam", "WarmupCosineWithRestartsSchedule"] class LRSchedule(object): From 20e652209c7da7a73c9d1f3a65418d0ea118680e Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Wed, 13 Mar 2019 16:13:37 +0100 Subject: [PATCH 09/21] relation classification: replacing entity mention with mask token --- pytorch_pretrained_bert/optimization.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 7eda3ba92a..9a873e221b 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -130,7 +130,7 @@ class BertAdam(Optimizer): max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 """ def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', - b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, + b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, init_weight_decay=0., max_grad_norm=1.0): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) @@ -150,7 +150,7 @@ class BertAdam(Optimizer): if warmup != -1 or t_total != -1: logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided.") defaults = dict(lr=lr, schedule=schedule, - b1=b1, b2=b2, e=e, weight_decay=weight_decay, + b1=b1, b2=b2, e=e, weight_decay=weight_decay, init_weight_decay=init_weight_decay, max_grad_norm=max_grad_norm) super(BertAdam, self).__init__(params, defaults) @@ -220,6 +220,8 @@ class BertAdam(Optimizer): if group['weight_decay'] > 0.0: update += group['weight_decay'] * p.data + # TODO: init weight decay + lr_scheduled = group['lr'] lr_scheduled *= group['schedule'].get_lr(state['step']) From bed6408dcce8cf1b04e8dfa41f481500f40e47ca Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Mon, 18 Mar 2019 13:09:55 +0100 Subject: [PATCH 10/21] branches, optim cosine fix --- pytorch_pretrained_bert/optimization.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index fa911e5c04..e553365b54 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -26,7 +26,9 @@ logger = logging.getLogger(__name__) def warmup_cosine(x, warmup=0.002): if x < warmup: return x/warmup - return 0.5 * (1.0 + torch.cos(math.pi * x)) + + x_ = (x - warmup) / (1 - warmup) # progress after warmup + return 0.5 * (1. + math.cos(math.pi * x_)) def warmup_constant(x, warmup=0.002): """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. From ef28b2c74739162b88d78009ed3fae74deeb8b36 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Mon, 18 Mar 2019 13:18:07 +0100 Subject: [PATCH 11/21] branches, optim cosine fix --- pytorch_pretrained_bert/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 95411857c6..aa59c7d7ec 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -26,7 +26,7 @@ logger = logging.getLogger(__name__) def warmup_cosine(x, warmup=0.002): if x < warmup: return x/warmup - x_ = (x - warmup) / (1 - warmup) # progress after warmup + x_ = (x - warmup) / (1 - warmup) # progress after warmup - return 0.5 * (1. + math.cos(math.pi * x_)) def warmup_constant(x, warmup=0.002): From 2283dcca5e62438efbe445d518cddfaa2842a75b Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Mon, 18 Mar 2019 13:40:12 +0100 Subject: [PATCH 12/21] import revert --- pytorch_pretrained_bert/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py index e82d409ee0..bd455b8d9c 100644 --- a/pytorch_pretrained_bert/__init__.py +++ b/pytorch_pretrained_bert/__init__.py @@ -18,7 +18,7 @@ from .modeling_gpt2 import (GPT2Config, GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, load_tf_weights_in_gpt2) -from .optimization import * +from .optimization import BertAdam from .optimization_openai import OpenAIAdam from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path From 19cc2c084e265d1cf2ae2218147cd52bdd71bf1a Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Mon, 18 Mar 2019 15:13:35 +0100 Subject: [PATCH 13/21] same --- pytorch_pretrained_bert/optimization_openai.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py index 7f56a1284c..99ac15e108 100644 --- a/pytorch_pretrained_bert/optimization_openai.py +++ b/pytorch_pretrained_bert/optimization_openai.py @@ -26,7 +26,8 @@ logger = logging.getLogger(__name__) def warmup_cosine(x, warmup=0.002): if x < warmup: return x/warmup - return 0.5 * (1.0 + torch.cos(math.pi * x)) + x_ = (x - warmup) / (1 - warmup) # progress after warmup + return 0.5 * (1. + math.cos(math.pi * x_)) def warmup_constant(x, warmup=0.002): """ Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps. From 7797d21b8d7180af2114890c4c3393c762a69154 Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Sun, 24 Mar 2019 13:34:30 -0700 Subject: [PATCH 14/21] Fix GPT2 language modeling loss computation --- pytorch_pretrained_bert/modeling_gpt2.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index c381b288f8..13ae7a2342 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -617,8 +617,16 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past) lm_logits = self.lm_head(hidden_states) if lm_labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[:, :-1] + shift_labels = torch_batch[:, 1:] + + # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes] + # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] + # We just flatten the tokens out this way. loss_fct = CrossEntropyLoss(ignore_index=-1) - loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)) + shift_labels.view(-1)) return loss return lm_logits, presents From 5938f31fa7aa28cdff662f79c7c038cab21bb370 Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Sun, 24 Mar 2019 13:35:32 -0700 Subject: [PATCH 15/21] Fix c/p typo from my experiment code --- pytorch_pretrained_bert/modeling_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 13ae7a2342..1733a5b3f4 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -619,7 +619,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): if lm_labels is not None: # Shift so that tokens < n predict n shift_logits = lm_logits[:, :-1] - shift_labels = torch_batch[:, 1:] + shift_labels = lm_labels[:, 1:] # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes] # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] From 2e6f5ffb96029398f740b6eacdc86b117cccb86b Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Sun, 24 Mar 2019 13:36:46 -0700 Subject: [PATCH 16/21] Fix GPT language model loss here as well --- pytorch_pretrained_bert/modeling_openai.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 296abbfc31..9c708f88a2 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -716,8 +716,16 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): hidden_states = self.transformer(input_ids, position_ids, token_type_ids) lm_logits = self.lm_head(hidden_states) if lm_labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[:, :-1] + shift_labels = lm_labels[:, 1:] + + # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes] + # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] + # We just flatten the tokens out this way. loss_fct = CrossEntropyLoss(ignore_index=-1) - loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)) + shift_labels.view(-1)) return loss return lm_logits From 472857c47f3b6a142a7aaa53836e33cd8543088d Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Sun, 24 Mar 2019 13:49:42 -0700 Subject: [PATCH 17/21] Fix typo syntax err (sorry, c/p from my repo) --- pytorch_pretrained_bert/modeling_gpt2.py | 2 +- pytorch_pretrained_bert/modeling_openai.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 1733a5b3f4..15e7ca26e1 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -625,7 +625,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] # We just flatten the tokens out this way. loss_fct = CrossEntropyLoss(ignore_index=-1) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) return loss return lm_logits, presents diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 9c708f88a2..ab4107667b 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -724,7 +724,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] # We just flatten the tokens out this way. loss_fct = CrossEntropyLoss(ignore_index=-1) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) return loss return lm_logits From 0dd796e359d1fbf9c0ea39b04e9b5655e5a09dee Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Sun, 24 Mar 2019 14:35:55 -0700 Subject: [PATCH 18/21] Also fix loss function issue with the double head models --- pytorch_pretrained_bert/modeling_gpt2.py | 5 ++++- pytorch_pretrained_bert/modeling_openai.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 15e7ca26e1..635326b408 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -698,8 +698,11 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] if lm_labels is not None: + shift_logits = lm_logits[:, :-1] + shift_labels = lm_labels[:, 1:] loss_fct = CrossEntropyLoss(ignore_index=-1) - losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))) + losses.append(loss_fct(shift_logits.view(-1, + shift_logits.size(-1)), shift_labels.view(-1))) if mc_labels is not None: loss_fct = CrossEntropyLoss() losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index ab4107667b..8c1dd5e4a3 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -811,8 +811,11 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] if lm_labels is not None: + shift_logits = lm_logits[:, :-1] + shift_labels = lm_labels[:, 1:] loss_fct = CrossEntropyLoss(ignore_index=-1) - losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))) + losses.append(loss_fct(shift_logits.view(-1, + shift_logits.size(-1)), shift_labels.view(-1))) if mc_labels is not None: loss_fct = CrossEntropyLoss() losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) From fda2f623953bfe2290cd65429eb008f02ebdb152 Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Sun, 24 Mar 2019 14:37:13 -0700 Subject: [PATCH 19/21] Fix test failures due to old torch issue with non-contiguous view --- pytorch_pretrained_bert/modeling_gpt2.py | 8 ++++---- pytorch_pretrained_bert/modeling_openai.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 635326b408..7a0bb4db53 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -618,8 +618,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): lm_logits = self.lm_head(hidden_states) if lm_labels is not None: # Shift so that tokens < n predict n - shift_logits = lm_logits[:, :-1] - shift_labels = lm_labels[:, 1:] + shift_logits = lm_logits[:, :-1].contiguous() + shift_labels = lm_labels[:, 1:].contiguous() # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes] # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] @@ -698,8 +698,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] if lm_labels is not None: - shift_logits = lm_logits[:, :-1] - shift_labels = lm_labels[:, 1:] + shift_logits = lm_logits[:, :-1].contiguous() + shift_labels = lm_labels[:, 1:].contiguous() loss_fct = CrossEntropyLoss(ignore_index=-1) losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 8c1dd5e4a3..4385c1eaa7 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -717,8 +717,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): lm_logits = self.lm_head(hidden_states) if lm_labels is not None: # Shift so that tokens < n predict n - shift_logits = lm_logits[:, :-1] - shift_labels = lm_labels[:, 1:] + shift_logits = lm_logits[:, :-1].contiguous() + shift_labels = lm_labels[:, 1:].contiguous() # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes] # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] @@ -811,8 +811,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] if lm_labels is not None: - shift_logits = lm_logits[:, :-1] - shift_labels = lm_labels[:, 1:] + shift_logits = lm_logits[:, :-1].contiguous() + shift_labels = lm_labels[:, 1:].contiguous() loss_fct = CrossEntropyLoss(ignore_index=-1) losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) From 01520d5412ab1b17c4ef0da5ed6cb9e62d6dfcb1 Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Wed, 27 Mar 2019 10:45:11 -0700 Subject: [PATCH 20/21] Remove my unhelpful comments :) --- pytorch_pretrained_bert/modeling_gpt2.py | 4 +--- pytorch_pretrained_bert/modeling_openai.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 7a0bb4db53..7b00ce7730 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -621,9 +621,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): shift_logits = lm_logits[:, :-1].contiguous() shift_labels = lm_labels[:, 1:].contiguous() - # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes] - # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] - # We just flatten the tokens out this way. + # Flatten the tokens loss_fct = CrossEntropyLoss(ignore_index=-1) loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 4385c1eaa7..7273e75bf6 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -720,9 +720,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): shift_logits = lm_logits[:, :-1].contiguous() shift_labels = lm_labels[:, 1:].contiguous() - # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes] - # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] - # We just flatten the tokens out this way. + # Flatten the tokens loss_fct = CrossEntropyLoss(ignore_index=-1) loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) From 60005f464d2069801a2cf26dc0f011da8ed639b0 Mon Sep 17 00:00:00 2001 From: jeonsworld <37530102+jeonsworld@users.noreply.github.com> Date: Sat, 30 Mar 2019 14:50:17 +0900 Subject: [PATCH 21/21] Update pregenerate_training_data.py If the value of rand_end is returned from the randint function, the value of sampled_doc_index that matches current_idx is returned from searchsorted. example: cumsum_max = {int64} 30 doc_cumsum = {ndarray} [ 5 7 11 19 30] doc_lengths = {list} : [5, 2, 4, 8, 11] if current_idx = 1, rand_start = 7 rand_end = 35 sentence_index = randint(7, 35) % cumsum_max if randint return 35, sentence_index becomes 5. if sentence_index is 5, np.searchsorted returns 1 equal to current_index. --- examples/lm_finetuning/pregenerate_training_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py index 498ab22333..8cc28d2e78 100644 --- a/examples/lm_finetuning/pregenerate_training_data.py +++ b/examples/lm_finetuning/pregenerate_training_data.py @@ -49,7 +49,7 @@ class DocumentDatabase: self._precalculate_doc_weights() rand_start = self.doc_cumsum[current_idx] rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx] - sentence_index = randint(rand_start, rand_end) % self.cumsum_max + sentence_index = randint(rand_start, rand_end-1) % self.cumsum_max sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right') else: # If we don't use sentence weighting, then every doc has an equal chance to be chosen