From 24107c2c83e79d195826f18f66892feab6b000e9 Mon Sep 17 00:00:00 2001 From: "Jin Young (Daniel) Sohn" Date: Fri, 14 Aug 2020 09:47:37 -0700 Subject: [PATCH] Fix TPU Convergence bug introduced by PR#6151 (#6488) Because of the bug introduced by PR#6151, we are currently taking two optimizer steps per batch on TPU: a global one, where `xm.optimizer_step` injects a CRS (cross-replica sum) between all cores in training, and a second one without it. This has been affecting training accuracy (for example, XLNet GLUE on MNLI is not converging). --- src/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 942975d407..baaf77ade3 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -572,7 +572,7 @@ class Trainer: if is_torch_tpu_available(): xm.optimizer_step(self.optimizer) - if self.args.fp16 and _use_native_amp: + elif self.args.fp16 and _use_native_amp: self.scaler.step(self.optimizer) self.scaler.update() else: