From b29eb247d39b56d903ea36c4f6c272a7bb0c0b4c Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Thu, 29 Apr 2021 18:33:47 -0400 Subject: [PATCH] Split checkpoint from model_name_or_path in examples (#11492) * Split checkpoint from model_name_or_path in examples * Address review comments * Address review comments --- examples/pytorch/README.md | 2 +- examples/pytorch/language-modeling/run_clm.py | 11 +++++------ examples/pytorch/language-modeling/run_mlm.py | 11 +++++------ examples/pytorch/language-modeling/run_plm.py | 11 +++++------ examples/pytorch/multiple-choice/run_swag.py | 11 +++++------ examples/pytorch/question-answering/run_qa.py | 11 +++++------ .../pytorch/question-answering/run_qa_beam_search.py | 11 +++++------ examples/pytorch/summarization/run_summarization.py | 11 +++++------ examples/pytorch/text-classification/run_glue.py | 12 ++++-------- examples/pytorch/text-classification/run_xnli.py | 9 +++------ examples/pytorch/token-classification/run_ner.py | 11 +++++------ examples/pytorch/translation/run_translation.py | 11 +++++------ src/transformers/training_args.py | 9 +++++++++ 13 files changed, 62 insertions(+), 69 deletions(-) diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md index 7fb888b27a..b5a770dd2e 100644 --- a/examples/pytorch/README.md +++ b/examples/pytorch/README.md @@ -65,7 +65,7 @@ examples/pytorch/token-classification/run_ner.py -h You can resume training from a previous checkpoint like this: 1. Pass `--output_dir previous_output_dir` without `--overwrite_output_dir` to resume training from the latest checkpoint in `output_dir` (what you would use if the training was interrupted, for instance). -2. Pass `--model_name_or_path path_to_a_specific_checkpoint` to resume training from that checkpoint folder. +2. Pass `--resume_from_checkpoint path_to_a_specific_checkpoint` to resume training from that checkpoint folder. 
Should you want to turn an example into a notebook where you'd no longer have access to the command line, 🤗 Trainer supports resuming from a checkpoint via `trainer.train(resume_from_checkpoint)`. diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index ad9acaf196..fdf0479095 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -190,7 +190,7 @@ def main(): f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) - elif last_checkpoint is not None: + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." @@ -413,12 +413,11 @@ def main(): # Training if training_args.do_train: - if last_checkpoint is not None: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: checkpoint = last_checkpoint - elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path): - checkpoint = model_args.model_name_or_path - else: - checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index f16082ceeb..928d68c8f0 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -199,7 +199,7 @@ def main(): f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." 
) - elif last_checkpoint is not None: + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." @@ -443,12 +443,11 @@ def main(): # Training if training_args.do_train: - if last_checkpoint is not None: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: checkpoint = last_checkpoint - elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path): - checkpoint = model_args.model_name_or_path - else: - checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 5a8be42bf5..2dea89f4d0 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -196,7 +196,7 @@ def main(): f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) - elif last_checkpoint is not None: + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
@@ -419,12 +419,11 @@ def main(): # Training if training_args.do_train: - if last_checkpoint is not None: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: checkpoint = last_checkpoint - elif model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path): - checkpoint = model_args.model_name_or_path - else: - checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index 0bc0ded2d8..2ee7ad7356 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -223,7 +223,7 @@ def main(): f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) - elif last_checkpoint is not None: + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
@@ -398,12 +398,11 @@ def main(): # Training if training_args.do_train: - if last_checkpoint is not None: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: checkpoint = last_checkpoint - elif os.path.isdir(model_args.model_name_or_path): - checkpoint = model_args.model_name_or_path - else: - checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload metrics = train_result.metrics diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index cd1d250c4a..07f7c28ba6 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -216,7 +216,7 @@ def main(): f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) - elif last_checkpoint is not None: + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
@@ -557,12 +557,11 @@ def main(): # Training if training_args.do_train: - if last_checkpoint is not None: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: checkpoint = last_checkpoint - elif os.path.isdir(model_args.model_name_or_path): - checkpoint = model_args.model_name_or_path - else: - checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index ffefee12f7..9da18ac5fd 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -215,7 +215,7 @@ def main(): f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) - elif last_checkpoint is not None: + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
@@ -595,12 +595,11 @@ def main(): # Training if training_args.do_train: - if last_checkpoint is not None: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: checkpoint = last_checkpoint - elif os.path.isdir(model_args.model_name_or_path): - checkpoint = model_args.model_name_or_path - else: - checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 841d7e9b58..05291a85fe 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -272,7 +272,7 @@ def main(): f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) - elif last_checkpoint is not None: + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
@@ -520,12 +520,11 @@ def main(): # Training if training_args.do_train: - if last_checkpoint is not None: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: checkpoint = last_checkpoint - elif os.path.isdir(model_args.model_name_or_path): - checkpoint = model_args.model_name_or_path - else: - checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index cd8c6a94ae..3e49f743f3 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -196,7 +196,7 @@ def main(): f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) - elif last_checkpoint is not None: + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." @@ -448,14 +448,10 @@ def main(): # Training if training_args.do_train: checkpoint = None - if last_checkpoint is not None: + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: checkpoint = last_checkpoint - elif os.path.isdir(model_args.model_name_or_path): - # Check the config from that potential checkpoint has the right number of labels before using it as a - # checkpoint. 
- if AutoConfig.from_pretrained(model_args.model_name_or_path).num_labels == num_labels: - checkpoint = model_args.model_name_or_path - train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics max_train_samples = ( diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index c1d8522c8d..21c071a812 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -335,13 +335,10 @@ def main(): # Training if training_args.do_train: checkpoint = None - if last_checkpoint is not None: + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: checkpoint = last_checkpoint - elif os.path.isdir(model_args.model_name_or_path): - # Check the config from that potential checkpoint has the right number of labels before using it as a - # checkpoint. - if AutoConfig.from_pretrained(model_args.model_name_or_path).num_labels == num_labels: - checkpoint = model_args.model_name_or_path train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics max_train_samples = ( diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index 6ca2c591ae..08434e554b 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -189,7 +189,7 @@ def main(): f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) - elif last_checkpoint is not None: + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
@@ -437,12 +437,11 @@ def main(): # Training if training_args.do_train: - if last_checkpoint is not None: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: checkpoint = last_checkpoint - elif os.path.isdir(model_args.model_name_or_path): - checkpoint = model_args.model_name_or_path - else: - checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) metrics = train_result.metrics trainer.save_model() # Saves the tokenizer too for easy upload diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index e4fd946e71..125ab70710 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -256,7 +256,7 @@ def main(): f"Output directory ({training_args.output_dir}) already exists and is not empty. " "Use --overwrite_output_dir to overcome." ) - elif last_checkpoint is not None: + elif last_checkpoint is not None and training_args.resume_from_checkpoint is None: logger.info( f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 
@@ -512,12 +512,11 @@ def main(): # Training if training_args.do_train: - if last_checkpoint is not None: + checkpoint = None + if training_args.resume_from_checkpoint is not None: + checkpoint = training_args.resume_from_checkpoint + elif last_checkpoint is not None: checkpoint = last_checkpoint - elif os.path.isdir(model_args.model_name_or_path): - checkpoint = model_args.model_name_or_path - else: - checkpoint = None train_result = trainer.train(resume_from_checkpoint=checkpoint) trainer.save_model() # Saves the tokenizer too for easy upload diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 30e433cfc7..6dde8fdd97 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -301,6 +301,11 @@ class TrainingArguments: :class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See the `example scripts <https://github.com/huggingface/transformers/tree/master/examples>`__ for more details. + resume_from_checkpoint (:obj:`str`, `optional`): + The path to a folder with a valid checkpoint for your model. This argument is not directly used by + :class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See + the `example scripts <https://github.com/huggingface/transformers/tree/master/examples>`__ for more + details. """ output_dir: str = field( @@ -531,6 +536,10 @@ class TrainingArguments: push_to_hub: bool = field( default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."} ) + resume_from_checkpoint: Optional[str] = field( + default=None, + metadata={"help": "The path to a folder with a valid checkpoint for your model."}, + ) _n_gpu: int = field(init=False, repr=False, default=-1) mp_parameters: str = field( default="",