From 4fb64e285acf6b5e998b140811f0385a6ec9d7bd Mon Sep 17 00:00:00 2001 From: Phuc Van Phan Date: Wed, 13 Sep 2023 00:31:23 +0700 Subject: [PATCH] chore: correct update_step and correct gradient_accumulation_steps (#26068) --- .../run_image_classification_no_trainer.py | 2 +- examples/pytorch/image-pretraining/run_mim_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 2 +- examples/pytorch/multiple-choice/run_swag_no_trainer.py | 2 +- .../question-answering/run_qa_beam_search_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa_no_trainer.py | 5 +++-- .../run_semantic_segmentation_no_trainer.py | 2 +- .../pytorch/summarization/run_summarization_no_trainer.py | 2 +- examples/pytorch/text-classification/run_glue_no_trainer.py | 2 +- examples/pytorch/token-classification/run_ner_no_trainer.py | 2 +- examples/pytorch/translation/run_translation_no_trainer.py | 2 +- 11 files changed, 13 insertions(+), 12 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 2fcb124faf..8de3fb8805 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -477,8 +477,8 @@ def main(): # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps resume_step -= starting_epoch * len(train_dataloader) - completed_steps = resume_step // args.gradient_accumulation_step # update the progress_bar if load from checkpoint progress_bar.update(completed_steps) diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index d9d90f27c9..daa6fdee47 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -701,8 +701,8 @@ def main(): # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) - resume_step -= starting_epoch * len(train_dataloader) completed_steps = resume_step // args.gradient_accumulation_steps + resume_step -= starting_epoch * len(train_dataloader) # update the progress_bar if load from checkpoint progress_bar.update(completed_steps) diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 5d60ffad12..3d035fded5 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -636,8 +636,8 @@ def main(): # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) - resume_step -= starting_epoch * len(train_dataloader) completed_steps = resume_step // args.gradient_accumulation_steps + resume_step -= starting_epoch * len(train_dataloader) # update the progress_bar if load from checkpoint progress_bar.update(completed_steps) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 7f032a7107..b8f3bd040c 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -583,8 +583,8 @@ def main(): # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps resume_step -= starting_epoch * len(train_dataloader) - completed_steps = resume_step // args.gradient_accumulation_stepp # update the progress_bar if load from checkpoint progress_bar.update(completed_steps) diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 31b65fa258..544a1ec5ea 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -820,8 +820,8 @@ def main(): # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps resume_step -= starting_epoch * len(train_dataloader) - completed_steps = resume_step // args.gradient_accumulation_stepp # update the progress_bar if load from checkpoint progress_bar.update(completed_steps) diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index b5ab3f2009..c5bce44f79 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -848,10 +848,11 @@ def main(): resume_step = None completed_steps = starting_epoch * num_update_steps_per_epoch else: - resume_step = int(training_difference.replace("step_", "")) + # need to multiply `gradient_accumulation_steps` to reflect real steps + resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps resume_step -= starting_epoch * len(train_dataloader) - completed_steps = resume_step // args.gradient_accumulation_stepp # update the progress_bar if load from checkpoint progress_bar.update(completed_steps) diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 716203aaa6..14c6685d7c 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -581,8 +581,8 @@ def main(): # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps resume_step -= starting_epoch * len(train_dataloader) - completed_steps = resume_step // args.gradient_accumulation_stepp # update the progress_bar if load from checkpoint progress_bar.update(completed_steps) diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 4750f18bce..9f51f449b1 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -652,8 +652,8 @@ def main(): # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps resume_step -= starting_epoch * len(train_dataloader) - completed_steps = resume_step // args.gradient_accumulation_stepp # update the progress_bar if load from checkpoint progress_bar.update(completed_steps) diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index bd18e0b754..7dc24a3cc8 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -530,8 +530,8 @@ def main(): # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps resume_step -= starting_epoch * len(train_dataloader) - completed_steps = resume_step // args.gradient_accumulation_step # update the progress_bar if load from checkpoint progress_bar.update(completed_steps) diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 51b1d6e9cc..b179fdb84e 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -690,8 +690,8 @@ def main(): # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps resume_step -= starting_epoch * len(train_dataloader) - completed_steps = resume_step // args.gradient_accumulation_stepp # update the progress_bar if load from checkpoint progress_bar.update(completed_steps) diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index baa2c0cc7b..ced2b459e6 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -633,8 +633,8 @@ def main(): # need to multiply `gradient_accumulation_steps` to reflect real steps resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps starting_epoch = resume_step // len(train_dataloader) + completed_steps = resume_step // args.gradient_accumulation_steps resume_step -= starting_epoch * len(train_dataloader) - completed_steps = resume_step // args.gradient_accumulation_stepp # update the progress_bar if load from checkpoint progress_bar.update(completed_steps)