From 60e1d883f19756989a532ab69a9bb309a945e13b Mon Sep 17 00:00:00 2001 From: Zachary Mueller Date: Wed, 27 Apr 2022 14:46:49 -0400 Subject: [PATCH] Fixup no_trainer save logic (#16968) * Fixup all examples --- .../run_image_classification_no_trainer.py | 30 +++++++++++-------- .../language-modeling/run_clm_no_trainer.py | 30 +++++++++++-------- .../language-modeling/run_mlm_no_trainer.py | 30 +++++++++++-------- .../multiple-choice/run_swag_no_trainer.py | 30 +++++++++++-------- .../run_qa_beam_search_no_trainer.py | 28 ++++++++++------- .../question-answering/run_qa_no_trainer.py | 30 +++++++++++-------- .../run_semantic_segmentation_no_trainer.py | 30 +++++++++++-------- .../run_wav2vec2_pretraining_no_trainer.py | 4 ++- .../run_summarization_no_trainer.py | 30 +++++++++++-------- .../run_glue_no_trainer.py | 30 +++++++++++-------- .../run_ner_no_trainer.py | 30 +++++++++++-------- .../translation/run_translation_no_trainer.py | 30 +++++++++++-------- 12 files changed, 200 insertions(+), 132 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 97bfe8e9cd..39f805b458 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -393,33 +393,39 @@ def main(): # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) - resume_step = None - path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last - if "epoch" in path: - args.num_train_epochs -= int(path.replace("epoch_", "")) - else: - resume_step = int(path.replace("step_", "")) - args.num_train_epochs -= resume_step // len(train_dataloader) - resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] - for epoch in range(args.num_train_epochs): + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + else: + resume_step = int(training_difference.replace("step_", "")) + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + + for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == 0 and step < resume_step: - continue + if args.resume_from_checkpoint and epoch == starting_epoch: + if resume_step is not None and step < resume_step: + completed_steps += 1 + continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch @@ -436,7 +442,7 @@ def main(): if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: - output_dir = f"step_{completed_steps}" + output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 8f14ee67c0..3e7cfaa3aa 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -503,34 +503,40 @@ def main(): # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) - resume_step = None - path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last - if "epoch" in path: - args.num_train_epochs -= int(path.replace("epoch_", "")) - else: - resume_step = int(path.replace("step_", "")) - args.num_train_epochs -= resume_step // len(train_dataloader) - resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] - for epoch in range(args.num_train_epochs): + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + else: + resume_step = int(training_difference.replace("step_", "")) + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + + for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == 0 and step < resume_step: - continue + if args.resume_from_checkpoint and epoch == starting_epoch: + if resume_step is not None and step < resume_step: + completed_steps += 1 + continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch @@ -547,7 +553,7 @@ def main(): if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: - output_dir = f"step_{completed_steps}" + output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index f05d92396e..d7d8d011ac 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -549,34 +549,40 @@ def main(): # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) - resume_step = None - path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last - if "epoch" in path: - args.num_train_epochs -= int(path.replace("epoch_", "")) - else: - resume_step = int(path.replace("step_", "")) - args.num_train_epochs -= resume_step // len(train_dataloader) - resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] - for epoch in range(args.num_train_epochs): + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + else: + resume_step = int(training_difference.replace("step_", "")) + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + + for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == 0 and step < resume_step: - continue + if args.resume_from_checkpoint and epoch == starting_epoch: + if resume_step is not None and step < resume_step: + completed_steps += 1 + continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch @@ -593,7 +599,7 @@ def main(): if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: - output_dir = f"step_{completed_steps}" + output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index d9f229f7e6..2c39d29cb1 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -506,34 +506,40 @@ def main(): # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) - resume_step = None - path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last - if "epoch" in path: - args.num_train_epochs -= int(path.replace("epoch_", "")) - else: - resume_step = int(path.replace("step_", "")) - args.num_train_epochs -= resume_step // len(train_dataloader) - resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] - for epoch in range(args.num_train_epochs): + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + else: + resume_step = int(training_difference.replace("step_", "")) + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + + for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == 0 and step < resume_step: - continue + if args.resume_from_checkpoint and epoch == starting_epoch: + if resume_step is not None and step < resume_step: + completed_steps += 1 + continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch @@ -550,7 +556,7 @@ def main(): if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: - output_dir = f"step_{completed_steps}" + output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index f7bba206a5..6e365c9814 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -765,34 +765,40 @@ def main(): # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) - resume_step = None - path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last - if "epoch" in path: - args.num_train_epochs -= int(path.replace("epoch_", "")) - else: - resume_step = int(path.replace("step_", "")) - args.num_train_epochs -= resume_step // len(train_dataloader) - resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] - for epoch in range(args.num_train_epochs): + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + else: + resume_step = int(training_difference.replace("step_", "")) + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + + for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == 0 and step < resume_step: - continue + if args.resume_from_checkpoint and epoch == starting_epoch: + if resume_step is not None and step < resume_step: + completed_steps += 1 + continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index c9e9b28efe..530df23fd2 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -771,34 +771,40 @@ def main(): # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) - resume_step = None - path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last - if "epoch" in path: - args.num_train_epochs -= int(path.replace("epoch_", "")) - else: - resume_step = int(path.replace("step_", "")) - args.num_train_epochs -= resume_step // len(train_dataloader) - resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] - for epoch in range(args.num_train_epochs): + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + else: + resume_step = int(training_difference.replace("step_", "")) + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + + for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == 0 and step < resume_step: - continue + if args.resume_from_checkpoint and epoch == starting_epoch: + if resume_step is not None and step < resume_step: + completed_steps += 1 + continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch @@ -815,7 +821,7 @@ def main(): if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: - output_dir = f"step_{completed_steps}" + output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 1e8d4ff8b6..d5a6a16fe4 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -501,34 +501,40 @@ def main(): # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) - resume_step = None - path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last - if "epoch" in path: - args.num_train_epochs -= int(path.replace("epoch_", "")) - else: - resume_step = int(path.replace("step_", "")) - args.num_train_epochs -= resume_step // len(train_dataloader) - resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] - for epoch in range(args.num_train_epochs): + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + else: + resume_step = int(training_difference.replace("step_", "")) + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + + for epoch in range(starting_epoch, args.num_train_epochs): if args.with_tracking: total_loss = 0 model.train() for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == 0 and step < resume_step: - continue + if args.resume_from_checkpoint and epoch == starting_epoch: + if resume_step is not None and step < resume_step: + completed_steps += 1 + continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch @@ -545,7 +551,7 @@ def main(): if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: - output_dir = f"step_{completed_steps}" + output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py index 1b43f54110..680808f2e7 100755 --- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py +++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py @@ -563,11 +563,13 @@ def main(): logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") logger.info(f" Total optimization steps = {args.max_train_steps}") completed_steps = 0 + starting_epoch = 0 # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 - for epoch in range(args.num_train_epochs): + starting_epoch = 0 + for epoch in range(starting_epoch, args.num_train_epochs): model.train() for step, batch in enumerate(train_dataloader): # compute num of losses diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index ab680d1d30..e08edbf513 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -569,33 +569,39 @@ def main(): # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) - resume_step = None - path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last - if "epoch" in path: - args.num_train_epochs -= int(path.replace("epoch_", "")) - else: - resume_step = int(path.replace("step_", "")) - args.num_train_epochs -= resume_step // len(train_dataloader) - resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] - for epoch in range(args.num_train_epochs): + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + else: + resume_step = int(training_difference.replace("step_", "")) + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + + for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == 0 and step < resume_step: - continue + if args.resume_from_checkpoint and epoch == starting_epoch: + if resume_step is not None and step < resume_step: + completed_steps += 1 + continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch @@ -612,7 +618,7 @@ def main(): if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: - output_dir = f"step_{completed_steps}" + output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 86906fe343..73f52825a3 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -454,33 +454,39 @@ def main(): # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) - resume_step = None - path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last - if "epoch" in path: - args.num_train_epochs -= int(path.replace("epoch_", "")) - else: - resume_step = int(path.replace("step_", "")) - args.num_train_epochs -= resume_step // len(train_dataloader) - resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] - for epoch in range(args.num_train_epochs): + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + else: + resume_step = int(training_difference.replace("step_", "")) + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + + for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == 0 and step < resume_step: - continue + if args.resume_from_checkpoint and epoch == starting_epoch: + if resume_step is not None and step < resume_step: + completed_steps += 1 + continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch @@ -497,7 +503,7 @@ def main(): if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: - output_dir = f"step_{completed_steps}" + output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 0eced00fec..6281ee162d 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -606,33 +606,39 @@ def main(): # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) - resume_step = None - path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last - if "epoch" in path: - args.num_train_epochs -= int(path.replace("epoch_", "")) - else: - resume_step = int(path.replace("step_", "")) - args.num_train_epochs -= resume_step // len(train_dataloader) - resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] - for epoch in range(args.num_train_epochs): + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + else: + resume_step = int(training_difference.replace("step_", "")) + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + + for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == 0 and step < resume_step: - continue + if args.resume_from_checkpoint and epoch == starting_epoch: + if resume_step is not None and step < resume_step: + completed_steps += 1 + continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch @@ -649,7 +655,7 @@ def main(): if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: - output_dir = f"step_{completed_steps}" + output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir) diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index 120aa3e488..597c5c353e 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -552,34 +552,40 @@ def main(): # Only show the progress bar once on each machine. progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process) completed_steps = 0 + starting_epoch = 0 # Potentially load in the weights and states from a previous save if args.resume_from_checkpoint: if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "": accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}") accelerator.load_state(args.resume_from_checkpoint) - resume_step = None - path = args.resume_from_checkpoint + path = os.path.basename(args.resume_from_checkpoint) else: # Get the most recent checkpoint dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()] dirs.sort(key=os.path.getctime) path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last - if "epoch" in path: - args.num_train_epochs -= int(path.replace("epoch_", "")) - else: - resume_step = int(path.replace("step_", "")) - args.num_train_epochs -= resume_step // len(train_dataloader) - resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step + # Extract `epoch_{i}` or `step_{i}` + training_difference = os.path.splitext(path)[0] - for epoch in range(args.num_train_epochs): + if "epoch" in training_difference: + starting_epoch = int(training_difference.replace("epoch_", "")) + 1 + resume_step = None + else: + resume_step = int(training_difference.replace("step_", "")) + starting_epoch = resume_step // len(train_dataloader) + resume_step -= starting_epoch * len(train_dataloader) + + for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 for step, batch in enumerate(train_dataloader): # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == 0 and step < resume_step: - continue + if args.resume_from_checkpoint and epoch == starting_epoch: + if resume_step is not None and step < resume_step: + completed_steps += 1 + continue outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch @@ -596,7 +602,7 @@ def main(): if isinstance(checkpointing_steps, int): if completed_steps % checkpointing_steps == 0: - output_dir = f"step_{completed_steps}" + output_dir = f"step_{completed_steps }" if args.output_dir is not None: output_dir = os.path.join(args.output_dir, output_dir) accelerator.save_state(output_dir)