diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 6a900ff761..9dd373e1a5 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -451,22 +451,26 @@ def main(): if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch else: resume_step = int(training_difference.replace("step_", "")) starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) + completed_steps = resume_step + + # update the progress_bar if load from checkpoint + progress_bar.update(completed_steps) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 - for step, batch in enumerate(train_dataloader): - # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == starting_epoch: - if resume_step is not None and step < resume_step: - completed_steps += 1 - continue - + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): with accelerator.accumulate(model): outputs = model(**batch) loss = outputs.loss diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index 126029150b..18d5c15638 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -660,29 +660,27 @@ def main(): if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch else: # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) + completed_steps = resume_step # update the progress_bar if load from checkpoint - progress_bar.update(starting_epoch * num_update_steps_per_epoch) - completed_steps = starting_epoch * num_update_steps_per_epoch + progress_bar.update(completed_steps) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 - for step, batch in enumerate(train_dataloader): - # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == starting_epoch: - if resume_step is not None and step < resume_step: - if step % args.gradient_accumulation_steps == 0: - progress_bar.update(1) - completed_steps += 1 - continue - + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): with accelerator.accumulate(model): outputs = model(**batch) loss = outputs.loss diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index c31f2867c3..fb07ad9392 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -566,29 +566,27 @@ def main(): if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch else: # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) + completed_steps = resume_step # update the progress_bar if load from checkpoint - progress_bar.update(starting_epoch * num_update_steps_per_epoch) - completed_steps = starting_epoch * num_update_steps_per_epoch + progress_bar.update(completed_steps) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 - for step, batch in enumerate(train_dataloader): - # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == starting_epoch: - if resume_step is not None and step < resume_step: - if step % args.gradient_accumulation_steps == 0: - progress_bar.update(1) - completed_steps += 1 - continue - + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): with accelerator.accumulate(model): outputs = model(**batch) loss = outputs.loss diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 29a2559851..593dae4628 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -610,29 +610,27 @@ def main(): if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch else: # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) + completed_steps = resume_step # update the progress_bar if load from checkpoint - progress_bar.update(starting_epoch * num_update_steps_per_epoch) - completed_steps = starting_epoch * num_update_steps_per_epoch + progress_bar.update(completed_steps) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 - for step, batch in enumerate(train_dataloader): - # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == starting_epoch: - if resume_step is not None and step < resume_step: - if step % args.gradient_accumulation_steps == 0: - progress_bar.update(1) - completed_steps += 1 - continue - + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): with accelerator.accumulate(model): outputs = model(**batch) loss = outputs.loss diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 21f2e1bf04..7492f31ddb 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -557,22 +557,26 @@ def main(): if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch else: resume_step = int(training_difference.replace("step_", "")) starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) + completed_steps = resume_step + + # update the progress_bar if load from checkpoint + progress_bar.update(completed_steps) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 - for step, batch in enumerate(train_dataloader): - # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == starting_epoch: - if resume_step is not None and step < resume_step: - completed_steps += 1 - continue - + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): with accelerator.accumulate(model): outputs = model(**batch) loss = outputs.loss diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 05c85bdf50..ed69e8b396 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -809,22 +809,26 @@ def main(): if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch else: resume_step = int(training_difference.replace("step_", "")) starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) + completed_steps = resume_step + + # update the progress_bar if load from checkpoint + progress_bar.update(completed_steps) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 - for step, batch in enumerate(train_dataloader): - # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == starting_epoch: - if resume_step is not None and step < resume_step: - completed_steps += 1 - continue - + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): with accelerator.accumulate(model): outputs = model(**batch) loss = outputs.loss diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 2e363ae970..8054779cc7 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -825,22 +825,26 @@ def main(): if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch else: resume_step = int(training_difference.replace("step_", "")) starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) + completed_steps = resume_step + + # update the progress_bar if load from checkpoint + progress_bar.update(completed_steps) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 - for step, batch in enumerate(train_dataloader): - # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == starting_epoch: - if resume_step is not None and step < resume_step: - completed_steps += 1 - continue - + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): with accelerator.accumulate(model): outputs = model(**batch) loss = outputs.loss diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index de997529de..5087a4186a 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -554,22 +554,26 @@ def main(): if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch else: resume_step = int(training_difference.replace("step_", "")) starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) + completed_steps = resume_step + + # update the progress_bar if load from checkpoint + progress_bar.update(completed_steps) for epoch in range(starting_epoch, args.num_train_epochs): + model.train() if args.with_tracking: total_loss = 0 - model.train() - for step, batch in enumerate(train_dataloader): - # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == starting_epoch: - if resume_step is not None and step < resume_step: - completed_steps += 1 - continue - + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): with accelerator.accumulate(model): outputs = model(**batch) loss = outputs.loss diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index ea09c6b89e..bef349fdb3 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -626,22 +626,26 @@ def main(): if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch else: resume_step = int(training_difference.replace("step_", "")) starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) + completed_steps = resume_step + + # update the progress_bar if load from checkpoint + progress_bar.update(completed_steps) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 - for step, batch in enumerate(train_dataloader): - # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == starting_epoch: - if resume_step is not None and step < resume_step: - completed_steps += 1 - continue - + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): with accelerator.accumulate(model): outputs = model(**batch) loss = outputs.loss diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 2fbacade06..2ba3c44498 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -510,12 +510,12 @@ def main(): model.train() if args.with_tracking: total_loss = 0 - for step, batch in enumerate(train_dataloader): - # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == starting_epoch: - if resume_step is not None and step < resume_step: - completed_steps += 1 - continue + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index bc51fab14e..82aea5d4d6 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -668,12 +668,12 @@ def main(): model.train() if args.with_tracking: total_loss = 0 - for step, batch in enumerate(train_dataloader): - # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == starting_epoch: - if resume_step is not None and step < resume_step: - completed_steps += 1 - continue + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index e52050308a..5267cea4e2 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -607,28 +607,27 @@ def main(): if "epoch" in training_difference: starting_epoch = int(training_difference.replace("epoch_", "")) + 1 resume_step = None + completed_steps = starting_epoch * num_update_steps_per_epoch else: # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) resume_step -= starting_epoch * len(train_dataloader) + completed_steps = resume_step # update the progress_bar if load from checkpoint - progress_bar.update(starting_epoch * num_update_steps_per_epoch) - completed_steps = starting_epoch * num_update_steps_per_epoch + progress_bar.update(completed_steps) for epoch in range(starting_epoch, args.num_train_epochs): model.train() if args.with_tracking: total_loss = 0 - for step, batch in enumerate(train_dataloader): - # We need to skip steps until we reach the resumed step - if args.resume_from_checkpoint and epoch == starting_epoch: - if resume_step is not None and step < resume_step: - if step % args.gradient_accumulation_steps == 0: - progress_bar.update(1) - completed_steps += 1 - continue + if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None: + # We skip the first `n` batches in the dataloader when resuming from a checkpoint + active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step) + else: + active_dataloader = train_dataloader + for step, batch in enumerate(active_dataloader): outputs = model(**batch) loss = outputs.loss # We keep track of the loss at each epoch