From 3775550c4b27e29fac18a545ed87f84c7451aa61 Mon Sep 17 00:00:00 2001
From: Pasquale Minervini
Date: Sun, 20 Oct 2019 22:33:56 +0100
Subject: [PATCH] gradient norm clipping should be done right before calling
the optimiser
---
examples/run_squad.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 71c656a13d..aaf4952198 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -157,13 +157,16 @@ def train(args, train_dataset, model, tokenizer):
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
- torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
loss.backward()
- torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
+ if args.fp16:
+ torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+ else:
+ torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()