updating readme and notebooks

This commit is contained in:
thomwolf
2018-11-16 14:31:15 +01:00
parent fd647e8c87
commit 886cb49792
6 changed files with 5367 additions and 330 deletions

View File

@@ -42,7 +42,7 @@ SCHEDULES = {
class BERTAdam(Optimizer):
"""Implements BERT version of Adam algorithm with weight decay fix (and no ).
"""Implements BERT version of Adam algorithm with weight decay fix.
Params:
lr: learning rate
warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
@@ -136,7 +136,7 @@ class BERTAdam(Optimizer):
# the correct way of using L2 regularization/weight decay with Adam,
# since that will interact with the m and v parameters in strange ways.
#
# Instead we want ot decay the weights in a manner that doesn't interact
# Instead we want to decay the weights in a manner that doesn't interact
# with the m/v parameters. This is equivalent to adding the square
# of the weights to the loss with plain (non-momentum) SGD.
if group['weight_decay_rate'] > 0.0:
@@ -154,6 +154,7 @@ class BERTAdam(Optimizer):
state['step'] += 1
# step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
# No bias correction
# bias_correction1 = 1 - beta1 ** state['step']
# bias_correction2 = 1 - beta2 ** state['step']