Update run_translation_no_trainer.py (#18637)
* Update run_translation_no_trainer.py found an error in selecting `no_decay` parameters and some small modifications when the user continues to train from a checkpoint * fixs `no_decay` and `resume_step` issue 1. change `no_decay` list 2. if use continue to train their model from provided checkpoint, the `resume_step` will not be initialized properly if `args.gradient_accumulation_steps != 1`
This commit is contained in:
@@ -464,7 +464,7 @@ def main():
|
||||
|
||||
# Optimizer
|
||||
# Split weights in two groups, one with weight decay and the other not.
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
no_decay = ["bias", "layer_norm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
@@ -558,10 +558,15 @@ def main():
|
||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||
resume_step = None
|
||||
else:
|
||||
resume_step = int(training_difference.replace("step_", ""))
|
||||
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
||||
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
||||
starting_epoch = resume_step // len(train_dataloader)
|
||||
resume_step -= starting_epoch * len(train_dataloader)
|
||||
|
||||
# update the progress_bar if load from checkpoint
|
||||
progress_bar.update(starting_epoch * num_update_steps_per_epoch)
|
||||
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||
|
||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||
model.train()
|
||||
if args.with_tracking:
|
||||
@@ -570,7 +575,9 @@ def main():
|
||||
# We need to skip steps until we reach the resumed step
|
||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
||||
if resume_step is not None and step < resume_step:
|
||||
completed_steps += 1
|
||||
if step % args.gradient_accumulation_steps == 0:
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
continue
|
||||
|
||||
with accelerator.accumulate(model):
|
||||
|
||||
@@ -602,10 +602,15 @@ def main():
|
||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||
resume_step = None
|
||||
else:
|
||||
resume_step = int(training_difference.replace("step_", ""))
|
||||
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
||||
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
||||
starting_epoch = resume_step // len(train_dataloader)
|
||||
resume_step -= starting_epoch * len(train_dataloader)
|
||||
|
||||
# update the progress_bar if load from checkpoint
|
||||
progress_bar.update(starting_epoch * num_update_steps_per_epoch)
|
||||
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||
|
||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||
model.train()
|
||||
if args.with_tracking:
|
||||
@@ -614,7 +619,9 @@ def main():
|
||||
# We need to skip steps until we reach the resumed step
|
||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
||||
if resume_step is not None and step < resume_step:
|
||||
completed_steps += 1
|
||||
if step % args.gradient_accumulation_steps == 0:
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
continue
|
||||
|
||||
with accelerator.accumulate(model):
|
||||
|
||||
@@ -510,7 +510,7 @@ def main():
|
||||
|
||||
# Optimizer
|
||||
# Split weights in two groups, one with weight decay and the other not.
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
@@ -607,10 +607,15 @@ def main():
|
||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||
resume_step = None
|
||||
else:
|
||||
resume_step = int(training_difference.replace("step_", ""))
|
||||
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
||||
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
||||
starting_epoch = resume_step // len(train_dataloader)
|
||||
resume_step -= starting_epoch * len(train_dataloader)
|
||||
|
||||
# update the progress_bar if load from checkpoint
|
||||
progress_bar.update(starting_epoch * num_update_steps_per_epoch)
|
||||
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||
|
||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||
model.train()
|
||||
if args.with_tracking:
|
||||
@@ -619,7 +624,9 @@ def main():
|
||||
# We need to skip steps until we reach the resumed step
|
||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
||||
if resume_step is not None and step < resume_step:
|
||||
completed_steps += 1
|
||||
if step % args.gradient_accumulation_steps == 0:
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
continue
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
|
||||
Reference in New Issue
Block a user