From c60e0e1ee45f4bf1017736b146c51729f120bb83 Mon Sep 17 00:00:00 2001
From: Stas Bekman
Date: Fri, 15 Jan 2021 10:12:26 -0800
Subject: [PATCH] deepspeed + grad acumm (#9622)

---
Notes (review):
    Line structure restored from a copy in which the whole patch had been
    collapsed onto a single line. Indentation of the hunk lines below was
    reconstructed from Python nesting and is NOT verified against the target
    tree -- confirm with `git apply --check` (or retry with
    `--ignore-whitespace`) before applying.

    What the patch does: in Trainer's optimizer-step branch, a truthy
    `self.deepspeed` now takes precedence and `self.deepspeed.step()` is
    called instead of the TPU / AMP branches; a multi-GPU DeepSpeed test
    running with `--gradient_accumulation_steps 2` is added.

 examples/seq2seq/test_finetune_trainer.py | 5 +++++
 src/transformers/trainer.py               | 4 +++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/examples/seq2seq/test_finetune_trainer.py b/examples/seq2seq/test_finetune_trainer.py
index 0affe52902..4a925a8e42 100644
--- a/examples/seq2seq/test_finetune_trainer.py
+++ b/examples/seq2seq/test_finetune_trainer.py
@@ -112,6 +112,11 @@ class TestFinetuneTrainer(TestCasePlus):
     def test_finetune_trainer_deepspeed(self):
         self.finetune_trainer_quick(deepspeed=True)
 
+    @require_torch_multi_gpu
+    @require_deepspeed
+    def test_finetune_trainer_deepspeed_grad_acum(self):
+        self.finetune_trainer_quick(deepspeed=True, extra_args_str="--gradient_accumulation_steps 2")
+
     @slow
     def test_finetune_trainer_slow(self):
         # There is a missing call to __init__process_group somewhere
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index a58119f88f..a6ca42a9b1 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -931,7 +931,9 @@ class Trainer:
                             )
 
                     # Optimizer step
-                    if is_torch_tpu_available():
+                    if self.deepspeed:
+                        self.deepspeed.step()
+                    elif is_torch_tpu_available():
                         xm.optimizer_step(self.optimizer)
                     elif self.use_amp:
                         self.scaler.step(self.optimizer)