Update run_translation_no_trainer.py (#18637)
* Update run_translation_no_trainer.py: found an error in selecting the `no_decay` parameters and made some small modifications for when the user continues training from a checkpoint. * Fixes the `no_decay` and `resume_step` issues: 1. Change the `no_decay` list. 2. If users continue training their model from a provided checkpoint, `resume_step` is not initialized properly when `args.gradient_accumulation_steps != 1`.
This commit is contained in:
@@ -464,7 +464,7 @@ def main():
|
|||||||
|
|
||||||
# Optimizer
|
# Optimizer
|
||||||
# Split weights in two groups, one with weight decay and the other not.
|
# Split weights in two groups, one with weight decay and the other not.
|
||||||
no_decay = ["bias", "LayerNorm.weight"]
|
no_decay = ["bias", "layer_norm.weight"]
|
||||||
optimizer_grouped_parameters = [
|
optimizer_grouped_parameters = [
|
||||||
{
|
{
|
||||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||||
@@ -558,10 +558,15 @@ def main():
|
|||||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||||
resume_step = None
|
resume_step = None
|
||||||
else:
|
else:
|
||||||
resume_step = int(training_difference.replace("step_", ""))
|
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
||||||
|
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
||||||
starting_epoch = resume_step // len(train_dataloader)
|
starting_epoch = resume_step // len(train_dataloader)
|
||||||
resume_step -= starting_epoch * len(train_dataloader)
|
resume_step -= starting_epoch * len(train_dataloader)
|
||||||
|
|
||||||
|
# update the progress_bar if load from checkpoint
|
||||||
|
progress_bar.update(starting_epoch * num_update_steps_per_epoch)
|
||||||
|
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||||
|
|
||||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
@@ -570,7 +575,9 @@ def main():
|
|||||||
# We need to skip steps until we reach the resumed step
|
# We need to skip steps until we reach the resumed step
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
if args.resume_from_checkpoint and epoch == starting_epoch:
|
||||||
if resume_step is not None and step < resume_step:
|
if resume_step is not None and step < resume_step:
|
||||||
completed_steps += 1
|
if step % args.gradient_accumulation_steps == 0:
|
||||||
|
progress_bar.update(1)
|
||||||
|
completed_steps += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
with accelerator.accumulate(model):
|
with accelerator.accumulate(model):
|
||||||
|
|||||||
@@ -602,10 +602,15 @@ def main():
|
|||||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||||
resume_step = None
|
resume_step = None
|
||||||
else:
|
else:
|
||||||
resume_step = int(training_difference.replace("step_", ""))
|
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
||||||
|
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
||||||
starting_epoch = resume_step // len(train_dataloader)
|
starting_epoch = resume_step // len(train_dataloader)
|
||||||
resume_step -= starting_epoch * len(train_dataloader)
|
resume_step -= starting_epoch * len(train_dataloader)
|
||||||
|
|
||||||
|
# update the progress_bar if load from checkpoint
|
||||||
|
progress_bar.update(starting_epoch * num_update_steps_per_epoch)
|
||||||
|
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||||
|
|
||||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
@@ -614,7 +619,9 @@ def main():
|
|||||||
# We need to skip steps until we reach the resumed step
|
# We need to skip steps until we reach the resumed step
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
if args.resume_from_checkpoint and epoch == starting_epoch:
|
||||||
if resume_step is not None and step < resume_step:
|
if resume_step is not None and step < resume_step:
|
||||||
completed_steps += 1
|
if step % args.gradient_accumulation_steps == 0:
|
||||||
|
progress_bar.update(1)
|
||||||
|
completed_steps += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
with accelerator.accumulate(model):
|
with accelerator.accumulate(model):
|
||||||
|
|||||||
@@ -510,7 +510,7 @@ def main():
|
|||||||
|
|
||||||
# Optimizer
|
# Optimizer
|
||||||
# Split weights in two groups, one with weight decay and the other not.
|
# Split weights in two groups, one with weight decay and the other not.
|
||||||
no_decay = ["bias", "LayerNorm.weight"]
|
no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
|
||||||
optimizer_grouped_parameters = [
|
optimizer_grouped_parameters = [
|
||||||
{
|
{
|
||||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||||
@@ -607,10 +607,15 @@ def main():
|
|||||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||||
resume_step = None
|
resume_step = None
|
||||||
else:
|
else:
|
||||||
resume_step = int(training_difference.replace("step_", ""))
|
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
||||||
|
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
||||||
starting_epoch = resume_step // len(train_dataloader)
|
starting_epoch = resume_step // len(train_dataloader)
|
||||||
resume_step -= starting_epoch * len(train_dataloader)
|
resume_step -= starting_epoch * len(train_dataloader)
|
||||||
|
|
||||||
|
# update the progress_bar if load from checkpoint
|
||||||
|
progress_bar.update(starting_epoch * num_update_steps_per_epoch)
|
||||||
|
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||||
|
|
||||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
@@ -619,7 +624,9 @@ def main():
|
|||||||
# We need to skip steps until we reach the resumed step
|
# We need to skip steps until we reach the resumed step
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
if args.resume_from_checkpoint and epoch == starting_epoch:
|
||||||
if resume_step is not None and step < resume_step:
|
if resume_step is not None and step < resume_step:
|
||||||
completed_steps += 1
|
if step % args.gradient_accumulation_steps == 0:
|
||||||
|
progress_bar.update(1)
|
||||||
|
completed_steps += 1
|
||||||
continue
|
continue
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
|||||||
Reference in New Issue
Block a user