From 08de989a0aac293278b353a85dfb9986b90d20ed Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Mon, 7 Sep 2020 04:54:00 -0400 Subject: [PATCH] Trainer with grad accum (#6930) * Add warning for gradient accumulation * Formatting --- src/transformers/training_args.py | 6 ++++++ src/transformers/training_args_tf.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index a9e0948dfe..60b86d28b7 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -60,6 +60,12 @@ class TrainingArguments: The batch size per GPU/TPU core/CPU for evaluation. gradient_accumulation_steps: (:obj:`int`, `optional`, defaults to 1): Number of updates steps to accumulate the gradients for, before performing a backward/update pass. + + .. warning:: + + When using gradient accumulation, one step is counted as one step with a backward pass. Therefore, + logging, evaluation, and saving will be conducted every ``gradient_accumulation_steps * xxx_step`` training + examples. learning_rate (:obj:`float`, `optional`, defaults to 5e-5): The initial learning rate for Adam. weight_decay (:obj:`float`, `optional`, defaults to 0): diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py index 94e5c3f320..486538fd2b 100644 --- a/src/transformers/training_args_tf.py +++ b/src/transformers/training_args_tf.py @@ -42,6 +42,12 @@ class TFTrainingArguments(TrainingArguments): The batch size per GPU/TPU core/CPU for evaluation. gradient_accumulation_steps: (:obj:`int`, `optional`, defaults to 1): Number of updates steps to accumulate the gradients for, before performing a backward/update pass. + + .. warning:: + + When using gradient accumulation, one step is counted as one step with a backward pass. Therefore, + logging, evaluation, and saving will be conducted every ``gradient_accumulation_steps * xxx_step`` training + examples. 
learning_rate (:obj:`float`, `optional`, defaults to 5e-5): The initial learning rate for Adam. weight_decay (:obj:`float`, `optional`, defaults to 0):