From 4faeb38b51055d329f4cc5839cd1fefbe27f9d8f Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Sat, 3 Nov 2018 17:52:51 +0000 Subject: [PATCH 1/2] Fix loss logging for multi-gpu compatibility --- run_classifier_pytorch.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/run_classifier_pytorch.py b/run_classifier_pytorch.py index f8cf4af808..c8ec8ab6e2 100644 --- a/run_classifier_pytorch.py +++ b/run_classifier_pytorch.py @@ -529,10 +529,10 @@ def main(): label_ids = label_ids.to(device) loss, _ = model(input_ids, segment_ids, input_mask, label_ids) - total_tr_loss += loss.item() + total_tr_loss += loss.sum().item() # sum() is to account for multi-gpu support. nb_tr_examples += input_ids.size(0) model.zero_grad() - loss.backward() + loss.sum().backward() # sum() is to account for multi-gpu support. optimizer.step() global_step += 1 @@ -573,7 +573,7 @@ def main(): label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) - eval_loss += tmp_eval_loss.item() + eval_loss += tmp_eval_loss.sum().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) From a1af5247e171354e8f39e577d861e63d7fa67a1e Mon Sep 17 00:00:00 2001 From: VictorSanh Date: Sat, 3 Nov 2018 14:00:36 -0400 Subject: [PATCH 2/2] Add seed in initialization --- run_classifier_pytorch.py | 12 ++++++++++-- run_squad_pytorch.py | 9 +++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/run_classifier_pytorch.py b/run_classifier_pytorch.py index c8ec8ab6e2..3410a09b04 100644 --- a/run_classifier_pytorch.py +++ b/run_classifier_pytorch.py @@ -427,7 +427,10 @@ def main(): type=int, default=-1, help="local_rank for distributed training on gpus") - + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") args = parser.parse_args() processors = { @@ -444,7 +447,12 @@ def main(): n_gpu = 1 # print("Initializing the distributed backend: NCCL") print("device", device, "n_gpu", n_gpu) - 
+ + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu>0: torch.cuda.manual_seed_all(args.seed) + if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") diff --git a/run_squad_pytorch.py b/run_squad_pytorch.py index 2a67262d96..a1db682cd4 100644 --- a/run_squad_pytorch.py +++ b/run_squad_pytorch.py @@ -745,6 +745,10 @@ def main(): type=int, default=-1, help="local_rank for distributed training on gpus") + parser.add_argument('--seed', + type=int, + default=42, + help="random seed for initialization") args = parser.parse_args() @@ -757,6 +761,11 @@ def main(): # print("Initializing the distributed backend: NCCL") print("device", device, "n_gpu", n_gpu) + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if n_gpu>0: torch.cuda.manual_seed_all(args.seed) + if not args.do_train and not args.do_predict: raise ValueError("At least one of `do_train` or `do_predict` must be True.")