From 1ceac85e23d8834af1fac6bcbbc16ab8f8f58c40 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Sun, 4 Nov 2018 15:26:14 +0100 Subject: [PATCH] add gradient accumulation --- run_classifier.py | 25 ++++++++++++++++--------- run_squad.py | 21 +++++++++++++-------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/run_classifier.py b/run_classifier.py index f6fe12ff98..6983dec5b5 100644 --- a/run_classifier.py +++ b/run_classifier.py @@ -426,7 +426,7 @@ def main(): parser.add_argument("--accumulate_gradients", type=int, default=1, - help="Number of steps to accumulate gradient on (divide the single step batch_size)") + help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)") parser.add_argument("--local_rank", type=int, default=-1, @@ -452,10 +452,17 @@ def main(): # print("Initializing the distributed backend: NCCL") print("device", device, "n_gpu", n_gpu) + if args.accumulate_gradients < 1: + raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format( + args.accumulate_gradients)) + + args.batch_size = args.batch_size / args.accumulate_gradients + random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) - if n_gpu>0: torch.cuda.manual_seed_all(args.seed) + if n_gpu > 0: + torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") @@ -531,11 +538,10 @@ def main(): train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) model.train() - for epoch in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 - for input_ids, input_mask, segment_ids, label_ids in tqdm(train_dataloader, desc="Iteration"): + for step, (input_ids, input_mask, segment_ids, label_ids) in enumerate(tqdm(train_dataloader, desc="Iteration")): input_ids = input_ids.to(device) input_mask = input_mask.float().to(device) segment_ids = 
segment_ids.to(device) @@ -546,12 +552,13 @@ def main(): loss = loss.mean() # mean() to average on multi-gpu. tr_loss += loss.item() nb_tr_examples += input_ids.size(0) - - model.zero_grad() - loss.backward() - optimizer.step() - global_step += 1 nb_tr_steps += 1 + loss.backward() + + if (step + 1) % args.gradient_accumulation_steps == 0: + optimizer.step() # We have accumulated enough gradients + model.zero_grad() + global_step += 1 if args.do_eval: eval_examples = processor.get_dev_examples(args.data_dir) diff --git a/run_squad.py b/run_squad.py index 434fee99de..6bf2f5d79d 100644 --- a/run_squad.py +++ b/run_squad.py @@ -731,10 +731,14 @@ def main(): type=int, default=-1, help="local_rank for distributed training on gpus") + parser.add_argument("--accumulate_gradients", + type=int, + default=1, + help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)") parser.add_argument('--seed', - type=int, - default=42, - help="random seed for initialization") + type=int, + default=42, + help="random seed for initialization") args = parser.parse_args() @@ -836,8 +840,8 @@ def main(): model.train() for epoch in trange(int(args.num_train_epochs), desc="Epoch"): - for input_ids, input_mask, segment_ids, start_positions, end_positions in tqdm(train_dataloader, - desc="Iteration"): + for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): + input_ids, input_mask, segment_ids, start_positions, end_positions = batch input_ids = input_ids.to(device) input_mask = input_mask.float().to(device) segment_ids = segment_ids.to(device) @@ -851,10 +855,11 @@ def main(): if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. - model.zero_grad() loss.backward() - optimizer.step() - global_step += 1 + if (step + 1) % args.gradient_accumulation_steps == 0: + optimizer.step() # We have accumulated enough gradients + model.zero_grad() + global_step += 1 if args.do_predict: eval_examples = read_squad_examples(