From 24107c2c83e79d195826f18f66892feab6b000e9 Mon Sep 17 00:00:00 2001
From: "Jin Young (Daniel) Sohn" <jysohn@google.com>
Date: Fri, 14 Aug 2020 09:47:37 -0700
Subject: [PATCH] Fix TPU Convergence bug introduced by PR#6151 (#6488)

With the bug introduced by PR #6151, we're taking two optimizer steps
per batch on TPU: a global one, where `xm.optimizer_step` injects a CRS
(cross-replica sum) between all cores in training, and one without.
This has been affecting training accuracy (for example, XLNet GLUE on
MNLI is not converging).
---
 src/transformers/trainer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 942975d407..baaf77ade3 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -572,7 +572,7 @@ class Trainer:
 
                     if is_torch_tpu_available():
                         xm.optimizer_step(self.optimizer)
-                    if self.args.fp16 and _use_native_amp:
+                    elif self.args.fp16 and _use_native_amp:
                         self.scaler.step(self.optimizer)
                         self.scaler.update()
                     else: