From 75259b44bf2e2b98b5a4d431fb400b7190342a01 Mon Sep 17 00:00:00 2001 From: Zachary Mueller Date: Thu, 23 Jun 2022 15:46:01 -0400 Subject: [PATCH] Properly calculate the total train iterations and recalculate num epochs in no_trainer scripts (#17856) --- .../run_image_classification_no_trainer.py | 9 ++++++--- .../pytorch/language-modeling/run_clm_no_trainer.py | 9 ++++++--- .../pytorch/language-modeling/run_mlm_no_trainer.py | 9 ++++++--- .../pytorch/multiple-choice/run_swag_no_trainer.py | 9 ++++++--- .../run_qa_beam_search_no_trainer.py | 9 ++++++--- .../pytorch/question-answering/run_qa_no_trainer.py | 9 ++++++--- .../run_semantic_segmentation_no_trainer.py | 9 ++++++--- .../run_wav2vec2_pretraining_no_trainer.py | 5 +++-- .../summarization/run_summarization_no_trainer.py | 9 ++++++--- .../text-classification/run_glue_no_trainer.py | 11 +++++++---- .../token-classification/run_ner_no_trainer.py | 9 ++++++--- .../pytorch/translation/run_translation_no_trainer.py | 10 ++++++---- 12 files changed, 70 insertions(+), 37 deletions(-) diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index 76b2059a1b..e59cc490f7 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -371,11 +371,11 @@ def main(): optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Scheduler and math around the number of training steps. + overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, @@ -391,7 +391,10 @@ def main(): # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 73d1ae0863..6be0e4da93 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -474,11 +474,11 @@ def main(): model.tie_weights() # Scheduler and math around the number of training steps. + overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, @@ -494,7 +494,10 @@ def main(): # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 32d42412e3..a4e475d8e2 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -518,11 +518,11 @@ def main(): # shorter in multiprocess) # Scheduler and math around the number of training steps. + overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, @@ -538,7 +538,10 @@ def main(): # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 6e948a315b..fb22d54b20 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -470,11 +470,11 @@ def main(): model.to(device) # Scheduler and math around the number of training steps. + overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, @@ -490,7 +490,10 @@ def main(): # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 32b152d9e4..0c6c916026 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -729,11 +729,11 @@ def main(): optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Scheduler and math around the number of training steps. + overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, @@ -749,7 +749,10 @@ def main(): # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index 90d955f1e6..7243a666f5 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -747,11 +747,11 @@ def main(): optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Scheduler and math around the number of training steps. + overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, @@ -767,7 +767,10 @@ def main(): # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index 37df263f5b..35d069a747 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -474,11 +474,11 @@ def main(): checkpointing_steps = None # Scheduler and math around the number of training steps. + overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, @@ -494,7 +494,10 @@ def main(): # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Instantiate metric metric = load_metric("mean_iou") diff --git a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py index 1f6125390d..a3db215d08 100755 --- a/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py +++ b/examples/pytorch/speech-pretraining/run_wav2vec2_pretraining_no_trainer.py @@ -546,8 +546,6 @@ def main(): if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) lr_scheduler = get_scheduler( name=args.lr_scheduler_type, @@ -556,6 +554,9 @@ def main(): num_training_steps=args.max_train_steps, ) + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + # 5. Train total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 98c7f09bd4..02e9203e7a 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -540,11 +540,11 @@ def main(): optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Scheduler and math around the number of training steps. + overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, @@ -560,7 +560,10 @@ def main(): # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 4e73a10e9a..4a022db69c 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -421,11 +421,11 @@ def main(): optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Scheduler and math around the number of training steps. + overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, @@ -439,9 +439,12 @@ def main(): model, optimizer, train_dataloader, eval_dataloader, lr_scheduler ) - # We need to recalculate our total training steps as the size of the training dataloader may have changed. + # We need to recalculate our total training steps as the size of the training dataloader may have changed num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 4910b30e04..e6fbc8bfd0 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -536,11 +536,11 @@ def main(): model.to(device) # Scheduler and math around the number of training steps. + overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, @@ -556,7 +556,10 @@ def main(): # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index acc49ffdfc..7227681d05 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -521,11 +521,11 @@ def main(): optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate) # Scheduler and math around the number of training steps. + overrode_max_train_steps = False num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) if args.max_train_steps is None: args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - else: - args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + overrode_max_train_steps = True lr_scheduler = get_scheduler( name=args.lr_scheduler_type, @@ -541,8 +541,10 @@ def main(): # We need to recalculate our total training steps as the size of the training dataloader may have changed. num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) - args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch - + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) # Figure out how many steps we should save the Accelerator states if hasattr(args.checkpointing_steps, "isdigit"): checkpointing_steps = args.checkpointing_steps