From b8ff56896ccbd27a54035a90a3bc278a44541a74 Mon Sep 17 00:00:00 2001 From: wangfei <1140554608@qq.com> Date: Fri, 16 Aug 2019 12:11:05 +0800 Subject: [PATCH 1/4] Fix bug of multi-gpu training in lm finetuning --- examples/lm_finetuning/finetune_on_pregenerated.py | 2 +- examples/lm_finetuning/simple_lm_finetuning.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 9fcc5f2cb1..7c40342f18 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -320,7 +320,7 @@ def main(): global_step += 1 # Save a trained model - if n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <=1 : + if args.local_rank == -1 or torch.distributed.get_rank() == 0: logging.info("** ** * Saving fine-tuned model ** ** * ") model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index ba5f832827..25333de0ed 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -507,7 +507,7 @@ def main(): if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) - if not os.path.exists(args.output_dir) and ( n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <=1 ): + if not os.path.exists(args.output_dir) and (args.local_rank == -1 or torch.distributed.get_rank() == 0): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) @@ -608,7 +608,7 @@ def main(): global_step += 1 # Save a trained model - if args.do_train and ( n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <=1): + if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("** ** * Saving fine - tuned model ** ** * ") model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) From 00e9c4cc9616cab1666cab0a331b5d7e68946928 Mon Sep 17 00:00:00 2001 From: wangfei <1140554608@qq.com> Date: Sun, 18 Aug 2019 11:02:02 +0800 Subject: [PATCH 2/4] Fix: save model/model.module --- examples/lm_finetuning/finetune_on_pregenerated.py | 11 ++++++----- examples/lm_finetuning/simple_lm_finetuning.py | 3 ++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 7c40342f18..1177d84cd4 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -155,12 +155,12 @@ def main(): help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") - parser.add_argument("--warmup_steps", - default=0, + parser.add_argument("--warmup_steps", + default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument("--adam_epsilon", - default=1e-8, + parser.add_argument("--adam_epsilon", + default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", @@ -322,7 +322,8 @@ def main(): # Save a trained model if args.local_rank == -1 or torch.distributed.get_rank() == 0: logging.info("** ** * Saving fine-tuned model ** ** * ") - model.save_pretrained(args.output_dir) + model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index 25333de0ed..9633640faf 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -610,7 +610,8 @@ def main(): # Save a trained model if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("** ** * Saving fine - tuned model ** ** * ") - model.save_pretrained(args.output_dir) + model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) From 1ef41b83374ce5756e24746201d21432d7ecada0 Mon Sep 17 00:00:00 2001 From: wangfei <1140554608@qq.com> Date: Sun, 18 Aug 2019 11:03:12 +0800 Subject: [PATCH 3/4] Revert "Fix: save model/model.module" This reverts commit 00e9c4cc9616cab1666cab0a331b5d7e68946928. --- examples/lm_finetuning/finetune_on_pregenerated.py | 11 +++++------ examples/lm_finetuning/simple_lm_finetuning.py | 3 +-- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 1177d84cd4..7c40342f18 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -155,12 +155,12 @@ def main(): help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") - parser.add_argument("--warmup_steps", - default=0, + parser.add_argument("--warmup_steps", + default=0, type=int, help="Linear warmup over warmup_steps.") - parser.add_argument("--adam_epsilon", - default=1e-8, + parser.add_argument("--adam_epsilon", + default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", @@ -322,8 +322,7 @@ def main(): # Save a trained model if args.local_rank == -1 or torch.distributed.get_rank() == 0: logging.info("** ** * Saving fine-tuned model ** ** * ") - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training - model_to_save.save_pretrained(args.output_dir) + model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index 9633640faf..25333de0ed 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -610,8 +610,7 @@ def main(): # Save a trained model if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("** ** * Saving fine - tuned model ** ** * ") - model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training - model_to_save.save_pretrained(args.output_dir) + model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) From 856a63da4d1f0f302633dc73e2d4a1f698bbafda Mon Sep 17 00:00:00 2001 From: wangfei <1140554608@qq.com> Date: Sun, 18 Aug 2019 11:03:47 +0800 Subject: [PATCH 4/4] Fix: save model/model.module --- examples/lm_finetuning/finetune_on_pregenerated.py | 3 ++- examples/lm_finetuning/simple_lm_finetuning.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 7c40342f18..eefa56c824 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -322,7 +322,8 @@ def main(): # Save a trained model if args.local_rank == -1 or torch.distributed.get_rank() == 0: logging.info("** ** * Saving fine-tuned model ** ** * ") - model.save_pretrained(args.output_dir) + model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir) diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py index 25333de0ed..9633640faf 100644 --- a/examples/lm_finetuning/simple_lm_finetuning.py +++ b/examples/lm_finetuning/simple_lm_finetuning.py @@ -610,7 +610,8 @@ def main(): # Save a trained model if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): logger.info("** ** * Saving fine - tuned model ** ** * ") - model.save_pretrained(args.output_dir) + model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training + model_to_save.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir)