From 39c3b1d9de3eb6353b0d6d8b01275a0655886b28 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Mon, 17 Aug 2020 05:33:12 -0700
Subject: [PATCH] [sched] polynomial_decay_schedule use default power=1.0
 (#6473)

---
 src/transformers/optimization.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index 410e54f0e8..7ae28d3ecf 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -166,7 +166,7 @@ def get_cosine_with_hard_restarts_schedule_with_warmup(
 
 
 def get_polynomial_decay_schedule_with_warmup(
-    optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=2.0, last_epoch=-1
+    optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1
 ):
     """
     Create a schedule with a learning rate that decreases as a polynomial decay
@@ -188,6 +188,10 @@ def get_polynomial_decay_schedule_with_warmup(
         last_epoch (:obj:`int`, `optional`, defaults to -1):
             The index of the last epoch when resuming training.
 
+    Note: `power` defaults to 1.0 as in the fairseq implementation, which in turn is
+    based on the original BERT implementation at
+    https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37
+
     Return:
         :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.