From 1ceac85e23d8834af1fac6bcbbc16ab8f8f58c40 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sun, 4 Nov 2018 15:26:14 +0100
Subject: [PATCH] add gradient accumulation

---
 run_classifier.py | 25 ++++++++++++++++---------
 run_squad.py      | 21 +++++++++++++--------
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/run_classifier.py b/run_classifier.py
index f6fe12ff98..6983dec5b5 100644
--- a/run_classifier.py
+++ b/run_classifier.py
@@ -426,7 +426,7 @@ def main():
     parser.add_argument("--accumulate_gradients",
                         type=int,
                         default=1,
-                        help="Number of steps to accumulate gradient on (divide the single step batch_size)")
+                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
     parser.add_argument("--local_rank",
                         type=int,
                         default=-1,
@@ -452,10 +452,17 @@ def main():
         # print("Initializing the distributed backend: NCCL")
     print("device", device, "n_gpu", n_gpu)
 
+    if args.accumulate_gradients < 1:
+        raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format(
+                            args.accumulate_gradients))
+
+    args.batch_size = args.batch_size / args.accumulate_gradients
+
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
-    if n_gpu>0: torch.cuda.manual_seed_all(args.seed)
+    if n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
 
     if not args.do_train and not args.do_eval:
         raise ValueError("At least one of `do_train` or `do_eval` must be True.")
@@ -531,11 +538,10 @@ def main():
         train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
 
         model.train()
-        
         for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
             tr_loss = 0
             nb_tr_examples, nb_tr_steps = 0, 0
-            for input_ids, input_mask, segment_ids, label_ids in tqdm(train_dataloader, desc="Iteration"):
+            for step, (input_ids, input_mask, segment_ids, label_ids) in enumerate(tqdm(train_dataloader, desc="Iteration")):
                 input_ids = input_ids.to(device)
                 input_mask = input_mask.float().to(device)
                 segment_ids = segment_ids.to(device)
@@ -546,12 +552,13 @@ def main():
                     loss = loss.mean() # mean() to average on multi-gpu.
                 tr_loss += loss.item()
                 nb_tr_examples += input_ids.size(0)
-
-                model.zero_grad()
-                loss.backward()
-                optimizer.step()
-                global_step += 1
                 nb_tr_steps += 1
+                loss.backward()
+
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    optimizer.step()    # We have accumulated enough gradients
+                    model.zero_grad()
+                    global_step += 1
 
     if args.do_eval:
         eval_examples = processor.get_dev_examples(args.data_dir)
diff --git a/run_squad.py b/run_squad.py
index 434fee99de..6bf2f5d79d 100644
--- a/run_squad.py
+++ b/run_squad.py
@@ -731,10 +731,14 @@ def main():
                         type=int,
                         default=-1,
                         help="local_rank for distributed training on gpus")
+    parser.add_argument("--accumulate_gradients",
+                        type=int,
+                        default=1,
+                        help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)")
     parser.add_argument('--seed', 
-                    type=int, 
-                    default=42,
-                    help="random seed for initialization")
+                        type=int, 
+                        default=42,
+                        help="random seed for initialization")
 
     args = parser.parse_args()
 
@@ -836,8 +840,8 @@ def main():
 
         model.train()
         for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
-            for input_ids, input_mask, segment_ids, start_positions, end_positions in tqdm(train_dataloader,
-                                                                                           desc="Iteration"):
+            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
+                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                 input_ids = input_ids.to(device)
                 input_mask = input_mask.float().to(device)
                 segment_ids = segment_ids.to(device)
@@ -851,10 +855,11 @@ def main():
                 if n_gpu > 1:
                     loss = loss.mean() # mean() to average on multi-gpu.
 
-                model.zero_grad()
                 loss.backward()
-                optimizer.step()
-                global_step += 1
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    optimizer.step()    # We have accumulated enough gradients
+                    model.zero_grad()
+                    global_step += 1
 
     if args.do_predict:
         eval_examples = read_squad_examples(