Properly calculate the total train iterations and recalculate num epochs in no_trainer scripts (#17856)
This commit is contained in:
@@ -371,11 +371,11 @@ def main():
|
|||||||
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
|
overrode_max_train_steps = False
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
else:
|
overrode_max_train_steps = True
|
||||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
|
||||||
|
|
||||||
lr_scheduler = get_scheduler(
|
lr_scheduler = get_scheduler(
|
||||||
name=args.lr_scheduler_type,
|
name=args.lr_scheduler_type,
|
||||||
@@ -391,7 +391,10 @@ def main():
|
|||||||
|
|
||||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
if overrode_max_train_steps:
|
||||||
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
|
# Afterwards we recalculate our number of training epochs
|
||||||
|
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||||
|
|
||||||
# Figure out how many steps we should save the Accelerator states
|
# Figure out how many steps we should save the Accelerator states
|
||||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
|||||||
@@ -474,11 +474,11 @@ def main():
|
|||||||
model.tie_weights()
|
model.tie_weights()
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
|
overrode_max_train_steps = False
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
else:
|
overrode_max_train_steps = True
|
||||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
|
||||||
|
|
||||||
lr_scheduler = get_scheduler(
|
lr_scheduler = get_scheduler(
|
||||||
name=args.lr_scheduler_type,
|
name=args.lr_scheduler_type,
|
||||||
@@ -494,7 +494,10 @@ def main():
|
|||||||
|
|
||||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
if overrode_max_train_steps:
|
||||||
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
|
# Afterwards we recalculate our number of training epochs
|
||||||
|
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||||
|
|
||||||
# Figure out how many steps we should save the Accelerator states
|
# Figure out how many steps we should save the Accelerator states
|
||||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
|||||||
@@ -518,11 +518,11 @@ def main():
|
|||||||
# shorter in multiprocess)
|
# shorter in multiprocess)
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
|
overrode_max_train_steps = False
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
else:
|
overrode_max_train_steps = True
|
||||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
|
||||||
|
|
||||||
lr_scheduler = get_scheduler(
|
lr_scheduler = get_scheduler(
|
||||||
name=args.lr_scheduler_type,
|
name=args.lr_scheduler_type,
|
||||||
@@ -538,7 +538,10 @@ def main():
|
|||||||
|
|
||||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
if overrode_max_train_steps:
|
||||||
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
|
# Afterwards we recalculate our number of training epochs
|
||||||
|
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||||
|
|
||||||
# Figure out how many steps we should save the Accelerator states
|
# Figure out how many steps we should save the Accelerator states
|
||||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
|||||||
@@ -470,11 +470,11 @@ def main():
|
|||||||
model.to(device)
|
model.to(device)
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
|
overrode_max_train_steps = False
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
else:
|
overrode_max_train_steps = True
|
||||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
|
||||||
|
|
||||||
lr_scheduler = get_scheduler(
|
lr_scheduler = get_scheduler(
|
||||||
name=args.lr_scheduler_type,
|
name=args.lr_scheduler_type,
|
||||||
@@ -490,7 +490,10 @@ def main():
|
|||||||
|
|
||||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
if overrode_max_train_steps:
|
||||||
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
|
# Afterwards we recalculate our number of training epochs
|
||||||
|
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||||
|
|
||||||
# Figure out how many steps we should save the Accelerator states
|
# Figure out how many steps we should save the Accelerator states
|
||||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
|||||||
@@ -729,11 +729,11 @@ def main():
|
|||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
|
overrode_max_train_steps = False
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
else:
|
overrode_max_train_steps = True
|
||||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
|
||||||
|
|
||||||
lr_scheduler = get_scheduler(
|
lr_scheduler = get_scheduler(
|
||||||
name=args.lr_scheduler_type,
|
name=args.lr_scheduler_type,
|
||||||
@@ -749,7 +749,10 @@ def main():
|
|||||||
|
|
||||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
if overrode_max_train_steps:
|
||||||
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
|
# Afterwards we recalculate our number of training epochs
|
||||||
|
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||||
|
|
||||||
# Figure out how many steps we should save the Accelerator states
|
# Figure out how many steps we should save the Accelerator states
|
||||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
|||||||
@@ -747,11 +747,11 @@ def main():
|
|||||||
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
|
overrode_max_train_steps = False
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
else:
|
overrode_max_train_steps = True
|
||||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
|
||||||
|
|
||||||
lr_scheduler = get_scheduler(
|
lr_scheduler = get_scheduler(
|
||||||
name=args.lr_scheduler_type,
|
name=args.lr_scheduler_type,
|
||||||
@@ -767,7 +767,10 @@ def main():
|
|||||||
|
|
||||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
if overrode_max_train_steps:
|
||||||
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
|
# Afterwards we recalculate our number of training epochs
|
||||||
|
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||||
|
|
||||||
# Figure out how many steps we should save the Accelerator states
|
# Figure out how many steps we should save the Accelerator states
|
||||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
|||||||
@@ -474,11 +474,11 @@ def main():
|
|||||||
checkpointing_steps = None
|
checkpointing_steps = None
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
|
overrode_max_train_steps = False
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
else:
|
overrode_max_train_steps = True
|
||||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
|
||||||
|
|
||||||
lr_scheduler = get_scheduler(
|
lr_scheduler = get_scheduler(
|
||||||
name=args.lr_scheduler_type,
|
name=args.lr_scheduler_type,
|
||||||
@@ -494,7 +494,10 @@ def main():
|
|||||||
|
|
||||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
if overrode_max_train_steps:
|
||||||
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
|
# Afterwards we recalculate our number of training epochs
|
||||||
|
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||||
|
|
||||||
# Instantiate metric
|
# Instantiate metric
|
||||||
metric = load_metric("mean_iou")
|
metric = load_metric("mean_iou")
|
||||||
|
|||||||
@@ -546,8 +546,6 @@ def main():
|
|||||||
|
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
else:
|
|
||||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
|
||||||
|
|
||||||
lr_scheduler = get_scheduler(
|
lr_scheduler = get_scheduler(
|
||||||
name=args.lr_scheduler_type,
|
name=args.lr_scheduler_type,
|
||||||
@@ -556,6 +554,9 @@ def main():
|
|||||||
num_training_steps=args.max_train_steps,
|
num_training_steps=args.max_train_steps,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Afterwards we recalculate our number of training epochs
|
||||||
|
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||||
|
|
||||||
# 5. Train
|
# 5. Train
|
||||||
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
||||||
|
|
||||||
|
|||||||
@@ -540,11 +540,11 @@ def main():
|
|||||||
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
|
overrode_max_train_steps = False
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
else:
|
overrode_max_train_steps = True
|
||||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
|
||||||
|
|
||||||
lr_scheduler = get_scheduler(
|
lr_scheduler = get_scheduler(
|
||||||
name=args.lr_scheduler_type,
|
name=args.lr_scheduler_type,
|
||||||
@@ -560,7 +560,10 @@ def main():
|
|||||||
|
|
||||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
if overrode_max_train_steps:
|
||||||
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
|
# Afterwards we recalculate our number of training epochs
|
||||||
|
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||||
|
|
||||||
# Figure out how many steps we should save the Accelerator states
|
# Figure out how many steps we should save the Accelerator states
|
||||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
|||||||
@@ -421,11 +421,11 @@ def main():
|
|||||||
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
|
overrode_max_train_steps = False
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
else:
|
overrode_max_train_steps = True
|
||||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
|
||||||
|
|
||||||
lr_scheduler = get_scheduler(
|
lr_scheduler = get_scheduler(
|
||||||
name=args.lr_scheduler_type,
|
name=args.lr_scheduler_type,
|
||||||
@@ -439,9 +439,12 @@ def main():
|
|||||||
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||||
)
|
)
|
||||||
|
|
||||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
# We need to recalculate our total training steps as the size of the training dataloader may have changed
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
if overrode_max_train_steps:
|
||||||
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
|
# Afterwards we recalculate our number of training epochs
|
||||||
|
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||||
|
|
||||||
# Figure out how many steps we should save the Accelerator states
|
# Figure out how many steps we should save the Accelerator states
|
||||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
|||||||
@@ -536,11 +536,11 @@ def main():
|
|||||||
model.to(device)
|
model.to(device)
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
|
overrode_max_train_steps = False
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
else:
|
overrode_max_train_steps = True
|
||||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
|
||||||
|
|
||||||
lr_scheduler = get_scheduler(
|
lr_scheduler = get_scheduler(
|
||||||
name=args.lr_scheduler_type,
|
name=args.lr_scheduler_type,
|
||||||
@@ -556,7 +556,10 @@ def main():
|
|||||||
|
|
||||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
if overrode_max_train_steps:
|
||||||
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
|
# Afterwards we recalculate our number of training epochs
|
||||||
|
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||||
|
|
||||||
# Figure out how many steps we should save the Accelerator states
|
# Figure out how many steps we should save the Accelerator states
|
||||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
|||||||
@@ -521,11 +521,11 @@ def main():
|
|||||||
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
|
overrode_max_train_steps = False
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
else:
|
overrode_max_train_steps = True
|
||||||
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
|
||||||
|
|
||||||
lr_scheduler = get_scheduler(
|
lr_scheduler = get_scheduler(
|
||||||
name=args.lr_scheduler_type,
|
name=args.lr_scheduler_type,
|
||||||
@@ -541,8 +541,10 @@ def main():
|
|||||||
|
|
||||||
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
# We need to recalculate our total training steps as the size of the training dataloader may have changed.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
if overrode_max_train_steps:
|
||||||
|
args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
|
||||||
|
# Afterwards we recalculate our number of training epochs
|
||||||
|
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
|
||||||
# Figure out how many steps we should save the Accelerator states
|
# Figure out how many steps we should save the Accelerator states
|
||||||
if hasattr(args.checkpointing_steps, "isdigit"):
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
checkpointing_steps = args.checkpointing_steps
|
checkpointing_steps = args.checkpointing_steps
|
||||||
|
|||||||
Reference in New Issue
Block a user