From 92a782b10836d8d8aae85a8c17932f36729d01bd Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 11 Jul 2019 22:20:10 +0200 Subject: [PATCH] fix run_glue test --- examples/run_glue.py | 18 ++++++++++++------ pytorch_transformers/optimization.py | 10 +++++----- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/examples/run_glue.py b/examples/run_glue.py index c3dffb4fdb..7e615804c1 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -53,6 +53,15 @@ MODEL_CLASSES = { 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), } + +def set_seed(args): + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if args.n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) + + def train(args, train_dataset, model, tokenizer): """ Train the model """ if args.local_rank in [-1, 0]: @@ -97,6 +106,7 @@ def train(args, train_dataset, model, tokenizer): tr_loss, logging_loss = 0.0, 0.0 model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) + set_seed(args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): @@ -371,12 +381,8 @@ def main(): logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s", args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16) - # Setup seeds - random.seed(args.seed) - np.random.seed(args.seed) - torch.manual_seed(args.seed) - if args.n_gpu > 0: - torch.cuda.manual_seed_all(args.seed) + # Set seed + set_seed(args) # Prepare GLUE task args.task_name = args.task_name.lower() diff --git a/pytorch_transformers/optimization.py b/pytorch_transformers/optimization.py index 8d224f1294..f0ac914341 100644 --- a/pytorch_transformers/optimization.py +++ b/pytorch_transformers/optimization.py @@ -167,14 +167,14 @@ class AdamW(Optimizer): # Decay the first and second moment running average coefficient # In-place operations to update the averages at the same time - exp_avg.mul_(beta1).add_(1 - beta1, grad) - exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + exp_avg.mul_(beta1).add_(1.0 - beta1, grad) + exp_avg_sq.mul_(beta2).addcmul_(1.0 - beta2, grad, grad) denom = exp_avg_sq.sqrt().add_(group['eps']) step_size = group['lr'] if group['correct_bias']: # No bias correction for Bert - bias_correction1 = 1 - beta1 ** state['step'] - bias_correction2 = 1 - beta2 ** state['step'] + bias_correction1 = 1.0 - beta1 ** state['step'] + bias_correction2 = 1.0 - beta2 ** state['step'] step_size = step_size * math.sqrt(bias_correction2) / bias_correction1 p.data.addcdiv_(-step_size, exp_avg, denom) @@ -187,7 +187,7 @@ class AdamW(Optimizer): # with the m/v parameters. This is equivalent to adding the square # of the weights to the loss with plain (non-momentum) SGD. # Add weight decay at the end (fixed version) - if group['weight_decay'] > 0: + if group['weight_decay'] > 0.0: p.data.add_(-group['lr'] * group['weight_decay'], p.data) return loss