diff --git a/run_classifier.py b/run_classifier.py index 54b7d5a26c..ab5251b1c0 100644 --- a/run_classifier.py +++ b/run_classifier.py @@ -392,10 +392,6 @@ def main(): default=False, action='store_true', help="Whether not to use CUDA when available") - parser.add_argument("--accumulate_gradients", - type=int, - default=1, - help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)") parser.add_argument("--local_rank", type=int, default=-1, @@ -426,11 +422,11 @@ def main(): torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) - if args.accumulate_gradients < 1: - raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format( - args.accumulate_gradients)) + if args.gradient_accumulation_steps < 1: + raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( + args.gradient_accumulation_steps)) - args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) + args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) diff --git a/run_squad.py b/run_squad.py index 78dff7dea5..59bd32c7c6 100644 --- a/run_squad.py +++ b/run_squad.py @@ -731,10 +731,6 @@ def main(): type=int, default=-1, help="local_rank for distributed training on gpus") - parser.add_argument("--accumulate_gradients", - type=int, - default=1, - help="Number of steps to accumulate gradient on (divide the batch_size and accumulate)") parser.add_argument('--seed', type=int, default=42, @@ -756,11 +752,11 @@ def main(): torch.distributed.init_process_group(backend='nccl') logger.info("device %s n_gpu %d distributed training %r", device, n_gpu, bool(args.local_rank != -1)) - if args.accumulate_gradients < 1: - raise ValueError("Invalid accumulate_gradients parameter: {}, should be >= 1".format( - args.accumulate_gradients)) + if args.gradient_accumulation_steps < 1: + raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( + args.gradient_accumulation_steps)) - args.train_batch_size = int(args.train_batch_size / args.accumulate_gradients) + args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed)