From 08de989a0aac293278b353a85dfb9986b90d20ed Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Mon, 7 Sep 2020 04:54:00 -0400
Subject: [PATCH] Trainer with grad accum (#6930)

* Add warning for gradient accumulation

* Formatting
---
 src/transformers/training_args.py    | 6 ++++++
 src/transformers/training_args_tf.py | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index a9e0948dfe..60b86d28b7 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -60,6 +60,12 @@ class TrainingArguments:
             The batch size per GPU/TPU core/CPU for evaluation.
         gradient_accumulation_steps: (:obj:`int`, `optional`, defaults to 1):
             Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
+
+            .. warning::
+
+                When using gradient accumulation, one step is counted as one step with a backward/update pass.
+                Therefore, logging, evaluation and saving will be conducted every
+                ``gradient_accumulation_steps * xxx_step`` training batches.
         learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
             The initial learning rate for Adam.
         weight_decay (:obj:`float`, `optional`, defaults to 0):
diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py
index 94e5c3f320..486538fd2b 100644
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -42,6 +42,12 @@ class TFTrainingArguments(TrainingArguments):
             The batch size per GPU/TPU core/CPU for evaluation.
         gradient_accumulation_steps: (:obj:`int`, `optional`, defaults to 1):
             Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
+
+            .. warning::
+
+                When using gradient accumulation, one step is counted as one step with a backward/update pass.
+                Therefore, logging, evaluation and saving will be conducted every
+                ``gradient_accumulation_steps * xxx_step`` training batches.
         learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
             The initial learning rate for Adam.
         weight_decay (:obj:`float`, `optional`, defaults to 0):