From 88874f6cf09e14fc482abc186adebb2767dca258 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Fri, 8 Mar 2019 19:08:30 +0100 Subject: [PATCH 01/27] BertAdam schedule objects --- pytorch_pretrained_bert/optimization.py | 141 +++++++++++++++++------- 1 file changed, 99 insertions(+), 42 deletions(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index fa911e5c04..73afc71058 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -23,29 +23,99 @@ import logging logger = logging.getLogger(__name__) -def warmup_cosine(x, warmup=0.002): - if x < warmup: - return x/warmup - return 0.5 * (1.0 + torch.cos(math.pi * x)) -def warmup_constant(x, warmup=0.002): - """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. - Learning rate is 1. afterwards. """ - if x < warmup: - return x/warmup - return 1.0 +class LRSchedule(object): + warn_t_total = False + def __init__(self, warmup=0.002, t_total=-1, **kw): + super(LRSchedule, self).__init__(**kw) + self.warmup, self.t_total = warmup, t_total + if t_total <= 0: + logger.warning("t_total value of {} results in schedule not being applied".format(t_total)) + if not 0.0 <= warmup < 1.0 and not warmup == -1: + raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) + self.warned_for_t_total_at_progress = -1 -def warmup_linear(x, warmup=0.002): - """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step. - After `t_total`-th training step, learning rate is zero. """ - if x < warmup: - return x/warmup - return max((x-1.)/(warmup-1.), 0) + def get_lr(self, step, nowarn=False): + progress = step / self.t_total + ret = self.get_lr_(progress) + # warning for exceeding t_total (only active with warmup_linear + if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress: + logger.warning( + "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly." + .format(ret, self.__class__.__name__)) + self.warned_for_t_total_at_progress = progress + # end warning + return ret + + def get_lr_(self, step): + return 1. + # raise NotImplemented("use subclass") + + +class WarmupCosineSchedule(LRSchedule): + warn_t_total = True + def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw): + super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw) + self.cycles = cycles + + def get_lr_(self, progress): + """ get learning rate multiplier """ + if self.t_total <= 0: + return 1. + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + return 0.5 * (1. + torch.cos(math.pi * self.cycles * 2 * progress)) + + +class WarmupConstantSchedule(LRSchedule): + warn_t_total = False + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return 1. + + +class WarmupLinearSchedule(LRSchedule): + warn_t_total = True + def get_lr_(self, progress): + if progress < self.warmup: + return progress / self.warmup + return max((progress - 1.) / (self.warmup - 1.), 0) +# +# +# def warmup_cosine(x, warmup=0.002): +# if x < warmup: +# return x/warmup +# return 0.5 * (1.0 + torch.cos(math.pi * x)) +# +# def warmup_constant(x, warmup=0.002): +# """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. +# Learning rate is 1. afterwards. """ +# if x < warmup: +# return x/warmup +# return 1.0 +# +# def warmup_linear(x, warmup=0.002): +# """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step. +# After `t_total`-th training step, learning rate is zero. """ +# if x < warmup: +# return x/warmup +# return max((x-1.)/(warmup-1.), 0) +# +# SCHEDULES = { +# 'warmup_cosine': warmup_cosine, +# 'warmup_constant': warmup_constant, +# 'warmup_linear': warmup_linear, +# } SCHEDULES = { - 'warmup_cosine': warmup_cosine, - 'warmup_constant': warmup_constant, - 'warmup_linear': warmup_linear, + None: LRSchedule, + "none": LRSchedule, + "warmup_cosine": WarmupCosineSchedule, + "warmup_constant": WarmupConstantSchedule, + "warmup_linear": WarmupLinearSchedule } @@ -70,15 +140,16 @@ class BertAdam(Optimizer): raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) if schedule not in SCHEDULES: raise ValueError("Invalid schedule parameter: {}".format(schedule)) - if not 0.0 <= warmup < 1.0 and not warmup == -1: - raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) if not 0.0 <= b1 < 1.0: raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) if not 0.0 <= b2 < 1.0: raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) if not e >= 0.0: raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) - defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, + # initialize schedule object + schedule_type = SCHEDULES[schedule] + sched = schedule_type(warmup=warmup, t_total=t_total) + defaults = dict(lr=lr, schedule=sched, b1=b1, b2=b2, e=e, weight_decay=weight_decay, max_grad_norm=max_grad_norm) super(BertAdam, self).__init__(params, defaults) @@ -90,11 +161,10 @@ class BertAdam(Optimizer): state = self.state[p] if len(state) == 0: return [0] - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) - else: - lr_scheduled = group['lr'] + + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'](state['step']) + lr.append(lr_scheduled) return lr @@ -109,8 +179,6 @@ class BertAdam(Optimizer): if closure is not None: loss = closure() - warned_for_t_total = False - for group in self.param_groups: for p in group['params']: if p.grad is None: @@ -152,19 +220,8 @@ class BertAdam(Optimizer): if group['weight_decay'] > 0.0: update += group['weight_decay'] * p.data - if group['t_total'] != -1: - schedule_fct = SCHEDULES[group['schedule']] - progress = state['step']/group['t_total'] - lr_scheduled = group['lr'] * schedule_fct(progress, group['warmup']) - # warning for exceeding t_total (only active with warmup_linear - if group['schedule'] == "warmup_linear" and progress > 1. and not warned_for_t_total: - logger.warning( - "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. " - "Please set 't_total' of {} correctly.".format(group['schedule'], lr_scheduled, self.__class__.__name__)) - warned_for_t_total = True - # end warning - else: - lr_scheduled = group['lr'] + lr_scheduled = group['lr'] + lr_scheduled *= group['schedule'](state['step']) update_with_lr = lr_scheduled * update p.data.add_(-update_with_lr) From 90a41dbe1404f734f6a25bfbaf89be71ba5e4613 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Sat, 9 Mar 2019 02:23:20 +0100 Subject: [PATCH 02/27] BertAdam schedule objects --- pytorch_pretrained_bert/__init__.py | 2 +- pytorch_pretrained_bert/optimization.py | 48 +++++++++---------------- 2 files changed, 17 insertions(+), 33 deletions(-) diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py index bd455b8d9c..e82d409ee0 100644 --- a/pytorch_pretrained_bert/__init__.py +++ b/pytorch_pretrained_bert/__init__.py @@ -18,7 +18,7 @@ from .modeling_gpt2 import (GPT2Config, GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, load_tf_weights_in_gpt2) -from .optimization import BertAdam +from .optimization import * from .optimization_openai import OpenAIAdam from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 73afc71058..cea35c39e9 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -24,6 +24,9 @@ import logging logger = logging.getLogger(__name__) +__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam"] + + class LRSchedule(object): warn_t_total = False def __init__(self, warmup=0.002, t_total=-1, **kw): @@ -83,32 +86,7 @@ class WarmupLinearSchedule(LRSchedule): if progress < self.warmup: return progress / self.warmup return max((progress - 1.) / (self.warmup - 1.), 0) -# -# -# def warmup_cosine(x, warmup=0.002): -# if x < warmup: -# return x/warmup -# return 0.5 * (1.0 + torch.cos(math.pi * x)) -# -# def warmup_constant(x, warmup=0.002): -# """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. -# Learning rate is 1. afterwards. """ -# if x < warmup: -# return x/warmup -# return 1.0 -# -# def warmup_linear(x, warmup=0.002): -# """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step. -# After `t_total`-th training step, learning rate is zero. """ -# if x < warmup: -# return x/warmup -# return max((x-1.)/(warmup-1.), 0) -# -# SCHEDULES = { -# 'warmup_cosine': warmup_cosine, -# 'warmup_constant': warmup_constant, -# 'warmup_linear': warmup_linear, -# } + SCHEDULES = { None: LRSchedule, @@ -126,7 +104,9 @@ class BertAdam(Optimizer): warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 t_total: total number of training steps for the learning rate schedule, -1 means constant learning rate. Default: -1 - schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' + schedule: schedule to use for the warmup (see above). + Can be 'warmup_linear', 'warmup_constant', 'warmup_cosine', or a LRSchedule object. + Default: 'warmup_linear' b1: Adams b1. Default: 0.9 b2: Adams b2. Default: 0.999 e: Adams epsilon. Default: 1e-6 @@ -147,9 +127,13 @@ class BertAdam(Optimizer): if not e >= 0.0: raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) # initialize schedule object - schedule_type = SCHEDULES[schedule] - sched = schedule_type(warmup=warmup, t_total=t_total) - defaults = dict(lr=lr, schedule=sched, + if not isinstance(schedule, LRSchedule): + schedule_type = SCHEDULES[schedule] + schedule = schedule_type(warmup=warmup, t_total=t_total) + else: + if warmup != -1 or t_total != -1: + logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided.") + defaults = dict(lr=lr, schedule=schedule, b1=b1, b2=b2, e=e, weight_decay=weight_decay, max_grad_norm=max_grad_norm) super(BertAdam, self).__init__(params, defaults) @@ -163,7 +147,7 @@ class BertAdam(Optimizer): return [0] lr_scheduled = group['lr'] - lr_scheduled *= group['schedule'](state['step']) + lr_scheduled *= group['schedule'].get_lr(state['step']) lr.append(lr_scheduled) return lr @@ -221,7 +205,7 @@ class BertAdam(Optimizer): update += group['weight_decay'] * p.data lr_scheduled = group['lr'] - lr_scheduled *= group['schedule'](state['step']) + lr_scheduled *= group['schedule'].get_lr(state['step']) update_with_lr = lr_scheduled * update p.data.add_(-update_with_lr) From f113a2dfdcf3116a35f856b274b4e4c2ecbeb6c0 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Sat, 9 Mar 2019 02:29:57 +0100 Subject: [PATCH 03/27] readme de --- pytorch_pretrained_bert/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index cea35c39e9..84f329feae 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -118,7 +118,7 @@ class BertAdam(Optimizer): max_grad_norm=1.0): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) - if schedule not in SCHEDULES: + if not isinstance(schedule, LRSchedule) and schedule not in SCHEDULES: raise ValueError("Invalid schedule parameter: {}".format(schedule)) if not 0.0 <= b1 < 1.0: raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) From 51efde54a907c85495263a980d253dcdb3e75209 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Sat, 9 Mar 2019 02:45:25 +0100 Subject: [PATCH 04/27] cos fix --- pytorch_pretrained_bert/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 84f329feae..a92adb4c56 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -69,7 +69,7 @@ class WarmupCosineSchedule(LRSchedule): return progress / self.warmup else: progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup - return 0.5 * (1. + torch.cos(math.pi * self.cycles * 2 * progress)) + return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) class WarmupConstantSchedule(LRSchedule): From baf66d141958785feb0dfc90d6cd8558eb95a774 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Tue, 12 Mar 2019 13:22:23 +0100 Subject: [PATCH 05/27] restart cosine lr schedule --- pytorch_pretrained_bert/optimization.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index a92adb4c56..58e16f01a6 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -69,7 +69,23 @@ class WarmupCosineSchedule(LRSchedule): return progress / self.warmup else: progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup - return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) + return 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1)) + + +class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule): + warn_t_total = True + def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw): + super(WarmupCosineWithRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw) + + def get_lr_(self, progress): + if self.t_total <= 0: + return 1. + if progress < self.warmup: + return progress / self.warmup + else: + progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup + ret = 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) + return ret class WarmupConstantSchedule(LRSchedule): From 902461333715d16773cac1e5e1300be705f49205 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Tue, 12 Mar 2019 13:23:58 +0100 Subject: [PATCH 06/27] changing docker --- pytorch_pretrained_bert/optimization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 58e16f01a6..481072c483 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -69,7 +69,7 @@ class WarmupCosineSchedule(LRSchedule): return progress / self.warmup else: progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup - return 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1)) + return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule): @@ -84,7 +84,7 @@ class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule): return progress / self.warmup else: progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup - ret = 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress)) + ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1))) return ret From 471daf1b6c0821e8b5ab6a173d7f41de079bae8a Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Tue, 12 Mar 2019 13:32:42 +0100 Subject: [PATCH 07/27] changing docker --- pytorch_pretrained_bert/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 481072c483..dcd8cfff2f 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -84,7 +84,7 @@ class WarmupCosineWithRestartsSchedule(WarmupCosineSchedule): return progress / self.warmup else: progress = (progress - self.warmup) / (1 - self.warmup) # progress after warmup - ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * 2 * progress) % 1))) + ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1))) return ret From eac039d21f8141c501bd75d02d46ba5545797b63 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Tue, 12 Mar 2019 13:45:12 +0100 Subject: [PATCH 08/27] changing docker --- pytorch_pretrained_bert/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index dcd8cfff2f..7eda3ba92a 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -24,7 +24,7 @@ import logging logger = logging.getLogger(__name__) -__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam"] +__all__ = ["LRSchedule", "WarmupLinearSchedule", "WarmupConstantSchedule", "WarmupCosineSchedule", "BertAdam", "WarmupCosineWithRestartsSchedule"] class LRSchedule(object): From 20e652209c7da7a73c9d1f3a65418d0ea118680e Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Wed, 13 Mar 2019 16:13:37 +0100 Subject: [PATCH 09/27] relation classification: replacing entity mention with mask token --- pytorch_pretrained_bert/optimization.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 7eda3ba92a..9a873e221b 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -130,7 +130,7 @@ class BertAdam(Optimizer): max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 """ def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', - b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, + b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, init_weight_decay=0., max_grad_norm=1.0): if lr is not required and lr < 0.0: raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) @@ -150,7 +150,7 @@ class BertAdam(Optimizer): if warmup != -1 or t_total != -1: logger.warning("Non-default warmup and t_total are ineffective when LRSchedule object is provided.") defaults = dict(lr=lr, schedule=schedule, - b1=b1, b2=b2, e=e, weight_decay=weight_decay, + b1=b1, b2=b2, e=e, weight_decay=weight_decay, init_weight_decay=init_weight_decay, max_grad_norm=max_grad_norm) super(BertAdam, self).__init__(params, defaults) @@ -220,6 +220,8 @@ class BertAdam(Optimizer): if group['weight_decay'] > 0.0: update += group['weight_decay'] * p.data + # TODO: init weight decay + lr_scheduled = group['lr'] lr_scheduled *= group['schedule'].get_lr(state['step']) From bed6408dcce8cf1b04e8dfa41f481500f40e47ca Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Mon, 18 Mar 2019 13:09:55 +0100 Subject: [PATCH 10/27] branches, optim cosine fix --- pytorch_pretrained_bert/optimization.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index fa911e5c04..e553365b54 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -26,7 +26,9 @@ logger = logging.getLogger(__name__) def warmup_cosine(x, warmup=0.002): if x < warmup: return x/warmup - return 0.5 * (1.0 + torch.cos(math.pi * x)) + + x_ = (x - warmup) / (1 - warmup) # progress after warmup + return 0.5 * (1. + math.cos(math.pi * x_)) def warmup_constant(x, warmup=0.002): """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. From ef28b2c74739162b88d78009ed3fae74deeb8b36 Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Mon, 18 Mar 2019 13:18:07 +0100 Subject: [PATCH 11/27] branches, optim cosine fix --- pytorch_pretrained_bert/optimization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py index 95411857c6..aa59c7d7ec 100644 --- a/pytorch_pretrained_bert/optimization.py +++ b/pytorch_pretrained_bert/optimization.py @@ -26,7 +26,7 @@ logger = logging.getLogger(__name__) def warmup_cosine(x, warmup=0.002): if x < warmup: return x/warmup - x_ = (x - warmup) / (1 - warmup) # progress after warmup + x_ = (x - warmup) / (1 - warmup) # progress after warmup - return 0.5 * (1. + math.cos(math.pi * x_)) def warmup_constant(x, warmup=0.002): From 2283dcca5e62438efbe445d518cddfaa2842a75b Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Mon, 18 Mar 2019 13:40:12 +0100 Subject: [PATCH 12/27] import revert --- pytorch_pretrained_bert/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/__init__.py b/pytorch_pretrained_bert/__init__.py index e82d409ee0..bd455b8d9c 100644 --- a/pytorch_pretrained_bert/__init__.py +++ b/pytorch_pretrained_bert/__init__.py @@ -18,7 +18,7 @@ from .modeling_gpt2 import (GPT2Config, GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, load_tf_weights_in_gpt2) -from .optimization import * +from .optimization import BertAdam from .optimization_openai import OpenAIAdam from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path From 19cc2c084e265d1cf2ae2218147cd52bdd71bf1a Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Mon, 18 Mar 2019 15:13:35 +0100 Subject: [PATCH 13/27] same --- pytorch_pretrained_bert/optimization_openai.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py index 7f56a1284c..99ac15e108 100644 --- a/pytorch_pretrained_bert/optimization_openai.py +++ b/pytorch_pretrained_bert/optimization_openai.py @@ -26,7 +26,8 @@ logger = logging.getLogger(__name__) def warmup_cosine(x, warmup=0.002): if x < warmup: return x/warmup - return 0.5 * (1.0 + torch.cos(math.pi * x)) + x_ = (x - warmup) / (1 - warmup) # progress after warmup + return 0.5 * (1. + math.cos(math.pi * x_)) def warmup_constant(x, warmup=0.002): """ Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps. From 7797d21b8d7180af2114890c4c3393c762a69154 Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Sun, 24 Mar 2019 13:34:30 -0700 Subject: [PATCH 14/27] Fix GPT2 language modeling loss computation --- pytorch_pretrained_bert/modeling_gpt2.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index c381b288f8..13ae7a2342 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -617,8 +617,16 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past) lm_logits = self.lm_head(hidden_states) if lm_labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[:, :-1] + shift_labels = torch_batch[:, 1:] + + # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes] + # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] + # We just flatten the tokens out this way. loss_fct = CrossEntropyLoss(ignore_index=-1) - loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)) + shift_labels.view(-1)) return loss return lm_logits, presents From 5938f31fa7aa28cdff662f79c7c038cab21bb370 Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Sun, 24 Mar 2019 13:35:32 -0700 Subject: [PATCH 15/27] Fix c/p typo from my experiment code --- pytorch_pretrained_bert/modeling_gpt2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 13ae7a2342..1733a5b3f4 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -619,7 +619,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): if lm_labels is not None: # Shift so that tokens < n predict n shift_logits = lm_logits[:, :-1] - shift_labels = torch_batch[:, 1:] + shift_labels = lm_labels[:, 1:] # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes] # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] From 2e6f5ffb96029398f740b6eacdc86b117cccb86b Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Sun, 24 Mar 2019 13:36:46 -0700 Subject: [PATCH 16/27] Fix GPT language model loss here as well --- pytorch_pretrained_bert/modeling_openai.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 296abbfc31..9c708f88a2 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -716,8 +716,16 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): hidden_states = self.transformer(input_ids, position_ids, token_type_ids) lm_logits = self.lm_head(hidden_states) if lm_labels is not None: + # Shift so that tokens < n predict n + shift_logits = lm_logits[:, :-1] + shift_labels = lm_labels[:, 1:] + + # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes] + # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] + # We just flatten the tokens out this way. loss_fct = CrossEntropyLoss(ignore_index=-1) - loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)) + shift_labels.view(-1)) return loss return lm_logits From 472857c47f3b6a142a7aaa53836e33cd8543088d Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Sun, 24 Mar 2019 13:49:42 -0700 Subject: [PATCH 17/27] Fix typo syntax err (sorry, c/p from my repo) --- pytorch_pretrained_bert/modeling_gpt2.py | 2 +- pytorch_pretrained_bert/modeling_openai.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 1733a5b3f4..15e7ca26e1 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -625,7 +625,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] # We just flatten the tokens out this way. loss_fct = CrossEntropyLoss(ignore_index=-1) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) return loss return lm_logits, presents diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 9c708f88a2..ab4107667b 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -724,7 +724,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] # We just flatten the tokens out this way. loss_fct = CrossEntropyLoss(ignore_index=-1) - loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) return loss return lm_logits From 0dd796e359d1fbf9c0ea39b04e9b5655e5a09dee Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Sun, 24 Mar 2019 14:35:55 -0700 Subject: [PATCH 18/27] Also fix loss function issue with the double head models --- pytorch_pretrained_bert/modeling_gpt2.py | 5 ++++- pytorch_pretrained_bert/modeling_openai.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 15e7ca26e1..635326b408 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -698,8 +698,11 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] if lm_labels is not None: + shift_logits = lm_logits[:, :-1] + shift_labels = lm_labels[:, 1:] loss_fct = CrossEntropyLoss(ignore_index=-1) - losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))) + losses.append(loss_fct(shift_logits.view(-1, + shift_logits.size(-1)), shift_labels.view(-1))) if mc_labels is not None: loss_fct = CrossEntropyLoss() losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index ab4107667b..8c1dd5e4a3 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -811,8 +811,11 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] if lm_labels is not None: + shift_logits = lm_logits[:, :-1] + shift_labels = lm_labels[:, 1:] loss_fct = CrossEntropyLoss(ignore_index=-1) - losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))) + losses.append(loss_fct(shift_logits.view(-1, + shift_logits.size(-1)), shift_labels.view(-1))) if mc_labels is not None: loss_fct = CrossEntropyLoss() losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1))) From fda2f623953bfe2290cd65429eb008f02ebdb152 Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Sun, 24 Mar 2019 14:37:13 -0700 Subject: [PATCH 19/27] Fix test failures due to old torch issue with non-contiguous view --- pytorch_pretrained_bert/modeling_gpt2.py | 8 ++++---- pytorch_pretrained_bert/modeling_openai.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 635326b408..7a0bb4db53 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -618,8 +618,8 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): lm_logits = self.lm_head(hidden_states) if lm_labels is not None: # Shift so that tokens < n predict n - shift_logits = lm_logits[:, :-1] - shift_labels = lm_labels[:, 1:] + shift_logits = lm_logits[:, :-1].contiguous() + shift_labels = lm_labels[:, 1:].contiguous() # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes] # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] @@ -698,8 +698,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] if lm_labels is not None: - shift_logits = lm_logits[:, :-1] - shift_labels = lm_labels[:, 1:] + shift_logits = lm_logits[:, :-1].contiguous() + shift_labels = lm_labels[:, 1:].contiguous() loss_fct = CrossEntropyLoss(ignore_index=-1) losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 8c1dd5e4a3..4385c1eaa7 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -717,8 +717,8 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): lm_logits = self.lm_head(hidden_states) if lm_labels is not None: # Shift so that tokens < n predict n - shift_logits = lm_logits[:, :-1] - shift_labels = lm_labels[:, 1:] + shift_logits = lm_logits[:, :-1].contiguous() + shift_labels = lm_labels[:, 1:].contiguous() # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes] # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] @@ -811,8 +811,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids) losses = [] if lm_labels is not None: - shift_logits = lm_logits[:, :-1] - shift_labels = lm_labels[:, 1:] + shift_logits = lm_logits[:, :-1].contiguous() + shift_labels = lm_labels[:, 1:].contiguous() loss_fct = CrossEntropyLoss(ignore_index=-1) losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))) From 01520d5412ab1b17c4ef0da5ed6cb9e62d6dfcb1 Mon Sep 17 00:00:00 2001 From: Catalin Voss Date: Wed, 27 Mar 2019 10:45:11 -0700 Subject: [PATCH 20/27] Remove my unhelpful comments :) --- pytorch_pretrained_bert/modeling_gpt2.py | 4 +--- pytorch_pretrained_bert/modeling_openai.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py index 7a0bb4db53..7b00ce7730 100644 --- a/pytorch_pretrained_bert/modeling_gpt2.py +++ b/pytorch_pretrained_bert/modeling_gpt2.py @@ -621,9 +621,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): shift_logits = lm_logits[:, :-1].contiguous() shift_labels = lm_labels[:, 1:].contiguous() - # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes] - # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] - # We just flatten the tokens out this way. + # Flatten the tokens loss_fct = CrossEntropyLoss(ignore_index=-1) loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 4385c1eaa7..7273e75bf6 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -720,9 +720,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): shift_logits = lm_logits[:, :-1].contiguous() shift_labels = lm_labels[:, 1:].contiguous() - # In tensorflow, it's [batch, d_0, d_1, ..., d_{r-1}, num_classes] - # in pytorch, it's [batch, num_classes, d_0, d_1, ..., d_{r-1}] - # We just flatten the tokens out this way. + # Flatten the tokens loss_fct = CrossEntropyLoss(ignore_index=-1) loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) From 60005f464d2069801a2cf26dc0f011da8ed639b0 Mon Sep 17 00:00:00 2001 From: jeonsworld <37530102+jeonsworld@users.noreply.github.com> Date: Sat, 30 Mar 2019 14:50:17 +0900 Subject: [PATCH 21/27] Update pregenerate_training_data.py If the value of rand_end is returned from the randint function, the value of sampled_doc_index that matches current_idx is returned from searchsorted. example: cumsum_max = {int64} 30 doc_cumsum = {ndarray} [ 5 7 11 19 30] doc_lengths = {list} : [5, 2, 4, 8, 11] if current_idx = 1, rand_start = 7 rand_end = 35 sentence_index = randint(7, 35) % cumsum_max if randint return 35, sentence_index becomes 5. if sentence_index is 5, np.searchsorted returns 1 equal to current_index. --- examples/lm_finetuning/pregenerate_training_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py index 498ab22333..8cc28d2e78 100644 --- a/examples/lm_finetuning/pregenerate_training_data.py +++ b/examples/lm_finetuning/pregenerate_training_data.py @@ -49,7 +49,7 @@ class DocumentDatabase: self._precalculate_doc_weights() rand_start = self.doc_cumsum[current_idx] rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx] - sentence_index = randint(rand_start, rand_end) % self.cumsum_max + sentence_index = randint(rand_start, rand_end-1) % self.cumsum_max sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right') else: # If we don't use sentence weighting, then every doc has an equal chance to be chosen From d07db28f52b4faf6769d6b7b454c950ec02dc641 Mon Sep 17 00:00:00 2001 From: Weixin Wang <6220861+MottoX@users.noreply.github.com> Date: Sun, 31 Mar 2019 01:20:18 +0800 Subject: [PATCH 22/27] Fix typo in example code Modify 'unambigiously' to 'unambiguously' --- examples/run_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_classifier.py b/examples/run_classifier.py index d79c2526b0..751d581ad9 100644 --- a/examples/run_classifier.py +++ b/examples/run_classifier.py @@ -442,7 +442,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, # sequence or the second sequence. The embedding vectors for `type=0` and # `type=1` were learned during pre-training and are added to the wordpiece # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambigiously separates the sequences, but it makes + # since the [SEP] token unambiguously separates the sequences, but it makes # it easier for the model to learn the concept of sequences. # # For classification tasks, the first vector (corresponding to [CLS]) is From 8b5c63e4deffad8c1c421caee8fef4bb97881f70 Mon Sep 17 00:00:00 2001 From: Mike Arpaia Date: Mon, 1 Apr 2019 12:53:51 -0600 Subject: [PATCH 23/27] Fixes to the TensorFlow conversion tool --- examples/extract_features.py | 2 +- pytorch_pretrained_bert/modeling.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/extract_features.py b/examples/extract_features.py index 0d59aa7e81..13384a9d69 100644 --- a/examples/extract_features.py +++ b/examples/extract_features.py @@ -57,7 +57,7 @@ class InputFeatures(object): def convert_examples_to_features(examples, seq_length, tokenizer): - """Loads a data file into a list of `InputBatch`s.""" + """Loads a data file into a list of `InputFeature`s.""" features = [] for (ex_index, example) in enumerate(examples): diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index b92f3a87f1..938636142f 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -76,7 +76,7 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): name = name.split('/') # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using pretrained model - if any(n in ["adam_v", "adam_m"] for n in name): + if any(n in ["adam_v", "adam_m", "global_step"] for n in name): print("Skipping {}".format("/".join(name))) continue pointer = model @@ -92,7 +92,11 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): elif l[0] == 'output_weights': pointer = getattr(pointer, 'weight') else: - pointer = getattr(pointer, l[0]) + try: + pointer = getattr(pointer, l[0]) + except AttributeError: + print("Skipping {}".format("/".join(name))) + continue if len(l) >= 2: num = int(l[1]) pointer = pointer[num] From f26ce6992e683a49799515e86086cf66599e2762 Mon Sep 17 00:00:00 2001 From: Weixin Wang <6220861+MottoX@users.noreply.github.com> Date: Tue, 2 Apr 2019 17:20:32 +0800 Subject: [PATCH 24/27] Fix links in README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6e9589aaa2..daac69de9f 100644 --- a/README.md +++ b/README.md @@ -1234,9 +1234,9 @@ A command-line interface is provided to convert a TensorFlow checkpoint in a PyT ### BERT -You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py`](convert_tf_checkpoint_to_pytorch.py) script. +You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the [`convert_tf_checkpoint_to_pytorch.py`](./pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py ) script. -This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in [`extract_features.py`](./examples/extract_features.py), [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`]((./examples/run_squad.py))). +This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using `torch.load()` (see examples in [`extract_features.py`](./examples/extract_features.py), [`run_classifier.py`](./examples/run_classifier.py) and [`run_squad.py`](./examples/run_squad.py)). You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file (`bert_config.json`) and the vocabulary file (`vocab.txt`) as these are needed for the PyTorch model too. From 846b1fd6f81a40ca363a91afaf3f67c3098e5687 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 3 Apr 2019 10:50:38 +0200 Subject: [PATCH 25/27] Fix #419 --- examples/run_squad.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index bb335ce75c..043b795326 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -85,9 +85,9 @@ class SquadExample(object): s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) if self.start_position: s += ", start_position: %d" % (self.start_position) - if self.start_position: + if self.end_position: s += ", end_position: %d" % (self.end_position) - if self.start_position: + if self.is_impossible: s += ", is_impossible: %r" % (self.is_impossible) return s From 1d8c23232444fbeae395a892fe4f353138e81fec Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 3 Apr 2019 10:51:03 +0200 Subject: [PATCH 26/27] Fix #436 --- pytorch_pretrained_bert/tokenization.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/pytorch_pretrained_bert/tokenization.py b/pytorch_pretrained_bert/tokenization.py index c549e06d78..bbb3e25fc7 100644 --- a/pytorch_pretrained_bert/tokenization.py +++ b/pytorch_pretrained_bert/tokenization.py @@ -105,13 +105,13 @@ class BertTokenizer(object): self.max_len = max_len if max_len is not None else int(1e12) def tokenize(self, text): + split_tokens = [] if self.do_basic_tokenize: - split_tokens = [] - for token in self.basic_tokenizer.tokenize(text): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) + for token in self.basic_tokenizer.tokenize(text): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) else: - split_tokens = self.wordpiece_tokenizer.tokenize(text) + split_tokens = self.wordpiece_tokenizer.tokenize(text) return split_tokens def convert_tokens_to_ids(self, tokens): @@ -142,6 +142,16 @@ class BertTokenizer(object): """ if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP: vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path] + if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True): + logger.warning("The pre-trained model you are loading is a cased model but you have not set " + "`do_lower_case` to False. We are setting `do_lower_case=False` for you but " + "you may want to check this behavior.") + kwargs['do_lower_case'] = False + elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True): + logger.warning("The pre-trained model you are loading is an uncased model but you have set " + "`do_lower_case` to False. We are setting `do_lower_case=True` for you " + "but you may want to check this behavior.") + kwargs['do_lower_case'] = True else: vocab_file = pretrained_model_name_or_path if os.path.isdir(vocab_file): From 19666dcb3bee3e379f1458e295869957aac8590c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 3 Apr 2019 11:01:01 +0200 Subject: [PATCH 27/27] Should fix #438 --- pytorch_pretrained_bert/modeling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index 938636142f..2736e34d7f 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -91,6 +91,8 @@ def load_tf_weights_in_bert(model, tf_checkpoint_path): pointer = getattr(pointer, 'bias') elif l[0] == 'output_weights': pointer = getattr(pointer, 'weight') + elif l[0] == 'squad': + pointer = getattr(pointer, 'classifier') else: try: pointer = getattr(pointer, l[0])