Update all no_trainer with skip_first_batches (#23664)
This commit is contained in:
@@ -451,22 +451,26 @@ def main():
|
|||||||
if "epoch" in training_difference:
|
if "epoch" in training_difference:
|
||||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||||
resume_step = None
|
resume_step = None
|
||||||
|
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||||
else:
|
else:
|
||||||
resume_step = int(training_difference.replace("step_", ""))
|
resume_step = int(training_difference.replace("step_", ""))
|
||||||
starting_epoch = resume_step // len(train_dataloader)
|
starting_epoch = resume_step // len(train_dataloader)
|
||||||
resume_step -= starting_epoch * len(train_dataloader)
|
resume_step -= starting_epoch * len(train_dataloader)
|
||||||
|
completed_steps = resume_step
|
||||||
|
|
||||||
|
# update the progress_bar if load from checkpoint
|
||||||
|
progress_bar.update(completed_steps)
|
||||||
|
|
||||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
total_loss = 0
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
|
||||||
# We need to skip steps until we reach the resumed step
|
# We skip the first `n` batches in the dataloader when resuming from a checkpoint
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
|
||||||
if resume_step is not None and step < resume_step:
|
else:
|
||||||
completed_steps += 1
|
active_dataloader = train_dataloader
|
||||||
continue
|
for step, batch in enumerate(active_dataloader):
|
||||||
|
|
||||||
with accelerator.accumulate(model):
|
with accelerator.accumulate(model):
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
|||||||
@@ -660,29 +660,27 @@ def main():
|
|||||||
if "epoch" in training_difference:
|
if "epoch" in training_difference:
|
||||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||||
resume_step = None
|
resume_step = None
|
||||||
|
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||||
else:
|
else:
|
||||||
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
||||||
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
||||||
starting_epoch = resume_step // len(train_dataloader)
|
starting_epoch = resume_step // len(train_dataloader)
|
||||||
resume_step -= starting_epoch * len(train_dataloader)
|
resume_step -= starting_epoch * len(train_dataloader)
|
||||||
|
completed_steps = resume_step
|
||||||
|
|
||||||
# update the progress_bar if load from checkpoint
|
# update the progress_bar if load from checkpoint
|
||||||
progress_bar.update(starting_epoch * num_update_steps_per_epoch)
|
progress_bar.update(completed_steps)
|
||||||
completed_steps = starting_epoch * num_update_steps_per_epoch
|
|
||||||
|
|
||||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
total_loss = 0
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
|
||||||
# We need to skip steps until we reach the resumed step
|
# We skip the first `n` batches in the dataloader when resuming from a checkpoint
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
|
||||||
if resume_step is not None and step < resume_step:
|
else:
|
||||||
if step % args.gradient_accumulation_steps == 0:
|
active_dataloader = train_dataloader
|
||||||
progress_bar.update(1)
|
for step, batch in enumerate(active_dataloader):
|
||||||
completed_steps += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
with accelerator.accumulate(model):
|
with accelerator.accumulate(model):
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
|||||||
@@ -566,29 +566,27 @@ def main():
|
|||||||
if "epoch" in training_difference:
|
if "epoch" in training_difference:
|
||||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||||
resume_step = None
|
resume_step = None
|
||||||
|
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||||
else:
|
else:
|
||||||
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
||||||
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
||||||
starting_epoch = resume_step // len(train_dataloader)
|
starting_epoch = resume_step // len(train_dataloader)
|
||||||
resume_step -= starting_epoch * len(train_dataloader)
|
resume_step -= starting_epoch * len(train_dataloader)
|
||||||
|
completed_steps = resume_step
|
||||||
|
|
||||||
# update the progress_bar if load from checkpoint
|
# update the progress_bar if load from checkpoint
|
||||||
progress_bar.update(starting_epoch * num_update_steps_per_epoch)
|
progress_bar.update(completed_steps)
|
||||||
completed_steps = starting_epoch * num_update_steps_per_epoch
|
|
||||||
|
|
||||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
total_loss = 0
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
|
||||||
# We need to skip steps until we reach the resumed step
|
# We skip the first `n` batches in the dataloader when resuming from a checkpoint
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
|
||||||
if resume_step is not None and step < resume_step:
|
else:
|
||||||
if step % args.gradient_accumulation_steps == 0:
|
active_dataloader = train_dataloader
|
||||||
progress_bar.update(1)
|
for step, batch in enumerate(active_dataloader):
|
||||||
completed_steps += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
with accelerator.accumulate(model):
|
with accelerator.accumulate(model):
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
|||||||
@@ -610,29 +610,27 @@ def main():
|
|||||||
if "epoch" in training_difference:
|
if "epoch" in training_difference:
|
||||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||||
resume_step = None
|
resume_step = None
|
||||||
|
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||||
else:
|
else:
|
||||||
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
||||||
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
||||||
starting_epoch = resume_step // len(train_dataloader)
|
starting_epoch = resume_step // len(train_dataloader)
|
||||||
resume_step -= starting_epoch * len(train_dataloader)
|
resume_step -= starting_epoch * len(train_dataloader)
|
||||||
|
completed_steps = resume_step
|
||||||
|
|
||||||
# update the progress_bar if load from checkpoint
|
# update the progress_bar if load from checkpoint
|
||||||
progress_bar.update(starting_epoch * num_update_steps_per_epoch)
|
progress_bar.update(completed_steps)
|
||||||
completed_steps = starting_epoch * num_update_steps_per_epoch
|
|
||||||
|
|
||||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
total_loss = 0
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
|
||||||
# We need to skip steps until we reach the resumed step
|
# We skip the first `n` batches in the dataloader when resuming from a checkpoint
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
|
||||||
if resume_step is not None and step < resume_step:
|
else:
|
||||||
if step % args.gradient_accumulation_steps == 0:
|
active_dataloader = train_dataloader
|
||||||
progress_bar.update(1)
|
for step, batch in enumerate(active_dataloader):
|
||||||
completed_steps += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
with accelerator.accumulate(model):
|
with accelerator.accumulate(model):
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
|||||||
@@ -557,22 +557,26 @@ def main():
|
|||||||
if "epoch" in training_difference:
|
if "epoch" in training_difference:
|
||||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||||
resume_step = None
|
resume_step = None
|
||||||
|
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||||
else:
|
else:
|
||||||
resume_step = int(training_difference.replace("step_", ""))
|
resume_step = int(training_difference.replace("step_", ""))
|
||||||
starting_epoch = resume_step // len(train_dataloader)
|
starting_epoch = resume_step // len(train_dataloader)
|
||||||
resume_step -= starting_epoch * len(train_dataloader)
|
resume_step -= starting_epoch * len(train_dataloader)
|
||||||
|
completed_steps = resume_step
|
||||||
|
|
||||||
|
# update the progress_bar if load from checkpoint
|
||||||
|
progress_bar.update(completed_steps)
|
||||||
|
|
||||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
total_loss = 0
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
|
||||||
# We need to skip steps until we reach the resumed step
|
# We skip the first `n` batches in the dataloader when resuming from a checkpoint
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
|
||||||
if resume_step is not None and step < resume_step:
|
else:
|
||||||
completed_steps += 1
|
active_dataloader = train_dataloader
|
||||||
continue
|
for step, batch in enumerate(active_dataloader):
|
||||||
|
|
||||||
with accelerator.accumulate(model):
|
with accelerator.accumulate(model):
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
|||||||
@@ -809,22 +809,26 @@ def main():
|
|||||||
if "epoch" in training_difference:
|
if "epoch" in training_difference:
|
||||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||||
resume_step = None
|
resume_step = None
|
||||||
|
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||||
else:
|
else:
|
||||||
resume_step = int(training_difference.replace("step_", ""))
|
resume_step = int(training_difference.replace("step_", ""))
|
||||||
starting_epoch = resume_step // len(train_dataloader)
|
starting_epoch = resume_step // len(train_dataloader)
|
||||||
resume_step -= starting_epoch * len(train_dataloader)
|
resume_step -= starting_epoch * len(train_dataloader)
|
||||||
|
completed_steps = resume_step
|
||||||
|
|
||||||
|
# update the progress_bar if load from checkpoint
|
||||||
|
progress_bar.update(completed_steps)
|
||||||
|
|
||||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
total_loss = 0
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
|
||||||
# We need to skip steps until we reach the resumed step
|
# We skip the first `n` batches in the dataloader when resuming from a checkpoint
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
|
||||||
if resume_step is not None and step < resume_step:
|
else:
|
||||||
completed_steps += 1
|
active_dataloader = train_dataloader
|
||||||
continue
|
for step, batch in enumerate(active_dataloader):
|
||||||
|
|
||||||
with accelerator.accumulate(model):
|
with accelerator.accumulate(model):
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
|||||||
@@ -825,22 +825,26 @@ def main():
|
|||||||
if "epoch" in training_difference:
|
if "epoch" in training_difference:
|
||||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||||
resume_step = None
|
resume_step = None
|
||||||
|
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||||
else:
|
else:
|
||||||
resume_step = int(training_difference.replace("step_", ""))
|
resume_step = int(training_difference.replace("step_", ""))
|
||||||
starting_epoch = resume_step // len(train_dataloader)
|
starting_epoch = resume_step // len(train_dataloader)
|
||||||
resume_step -= starting_epoch * len(train_dataloader)
|
resume_step -= starting_epoch * len(train_dataloader)
|
||||||
|
completed_steps = resume_step
|
||||||
|
|
||||||
|
# update the progress_bar if load from checkpoint
|
||||||
|
progress_bar.update(completed_steps)
|
||||||
|
|
||||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
total_loss = 0
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
|
||||||
# We need to skip steps until we reach the resumed step
|
# We skip the first `n` batches in the dataloader when resuming from a checkpoint
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
|
||||||
if resume_step is not None and step < resume_step:
|
else:
|
||||||
completed_steps += 1
|
active_dataloader = train_dataloader
|
||||||
continue
|
for step, batch in enumerate(active_dataloader):
|
||||||
|
|
||||||
with accelerator.accumulate(model):
|
with accelerator.accumulate(model):
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
|||||||
@@ -554,22 +554,26 @@ def main():
|
|||||||
if "epoch" in training_difference:
|
if "epoch" in training_difference:
|
||||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||||
resume_step = None
|
resume_step = None
|
||||||
|
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||||
else:
|
else:
|
||||||
resume_step = int(training_difference.replace("step_", ""))
|
resume_step = int(training_difference.replace("step_", ""))
|
||||||
starting_epoch = resume_step // len(train_dataloader)
|
starting_epoch = resume_step // len(train_dataloader)
|
||||||
resume_step -= starting_epoch * len(train_dataloader)
|
resume_step -= starting_epoch * len(train_dataloader)
|
||||||
|
completed_steps = resume_step
|
||||||
|
|
||||||
|
# update the progress_bar if load from checkpoint
|
||||||
|
progress_bar.update(completed_steps)
|
||||||
|
|
||||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||||
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
total_loss = 0
|
total_loss = 0
|
||||||
model.train()
|
if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
|
||||||
for step, batch in enumerate(train_dataloader):
|
# We skip the first `n` batches in the dataloader when resuming from a checkpoint
|
||||||
# We need to skip steps until we reach the resumed step
|
active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
else:
|
||||||
if resume_step is not None and step < resume_step:
|
active_dataloader = train_dataloader
|
||||||
completed_steps += 1
|
for step, batch in enumerate(active_dataloader):
|
||||||
continue
|
|
||||||
|
|
||||||
with accelerator.accumulate(model):
|
with accelerator.accumulate(model):
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
|||||||
@@ -626,22 +626,26 @@ def main():
|
|||||||
if "epoch" in training_difference:
|
if "epoch" in training_difference:
|
||||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||||
resume_step = None
|
resume_step = None
|
||||||
|
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||||
else:
|
else:
|
||||||
resume_step = int(training_difference.replace("step_", ""))
|
resume_step = int(training_difference.replace("step_", ""))
|
||||||
starting_epoch = resume_step // len(train_dataloader)
|
starting_epoch = resume_step // len(train_dataloader)
|
||||||
resume_step -= starting_epoch * len(train_dataloader)
|
resume_step -= starting_epoch * len(train_dataloader)
|
||||||
|
completed_steps = resume_step
|
||||||
|
|
||||||
|
# update the progress_bar if load from checkpoint
|
||||||
|
progress_bar.update(completed_steps)
|
||||||
|
|
||||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
total_loss = 0
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
|
||||||
# We need to skip steps until we reach the resumed step
|
# We skip the first `n` batches in the dataloader when resuming from a checkpoint
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
|
||||||
if resume_step is not None and step < resume_step:
|
else:
|
||||||
completed_steps += 1
|
active_dataloader = train_dataloader
|
||||||
continue
|
for step, batch in enumerate(active_dataloader):
|
||||||
|
|
||||||
with accelerator.accumulate(model):
|
with accelerator.accumulate(model):
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
|||||||
@@ -510,12 +510,12 @@ def main():
|
|||||||
model.train()
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
total_loss = 0
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
|
||||||
# We need to skip steps until we reach the resumed step
|
# We skip the first `n` batches in the dataloader when resuming from a checkpoint
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
|
||||||
if resume_step is not None and step < resume_step:
|
else:
|
||||||
completed_steps += 1
|
active_dataloader = train_dataloader
|
||||||
continue
|
for step, batch in enumerate(active_dataloader):
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
# We keep track of the loss at each epoch
|
# We keep track of the loss at each epoch
|
||||||
|
|||||||
@@ -668,12 +668,12 @@ def main():
|
|||||||
model.train()
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
total_loss = 0
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
|
||||||
# We need to skip steps until we reach the resumed step
|
# We skip the first `n` batches in the dataloader when resuming from a checkpoint
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
|
||||||
if resume_step is not None and step < resume_step:
|
else:
|
||||||
completed_steps += 1
|
active_dataloader = train_dataloader
|
||||||
continue
|
for step, batch in enumerate(active_dataloader):
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
# We keep track of the loss at each epoch
|
# We keep track of the loss at each epoch
|
||||||
|
|||||||
@@ -607,28 +607,27 @@ def main():
|
|||||||
if "epoch" in training_difference:
|
if "epoch" in training_difference:
|
||||||
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
starting_epoch = int(training_difference.replace("epoch_", "")) + 1
|
||||||
resume_step = None
|
resume_step = None
|
||||||
|
completed_steps = starting_epoch * num_update_steps_per_epoch
|
||||||
else:
|
else:
|
||||||
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
# need to multiply `gradient_accumulation_steps` to reflect real steps
|
||||||
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
|
||||||
starting_epoch = resume_step // len(train_dataloader)
|
starting_epoch = resume_step // len(train_dataloader)
|
||||||
resume_step -= starting_epoch * len(train_dataloader)
|
resume_step -= starting_epoch * len(train_dataloader)
|
||||||
|
completed_steps = resume_step
|
||||||
|
|
||||||
# update the progress_bar if load from checkpoint
|
# update the progress_bar if load from checkpoint
|
||||||
progress_bar.update(starting_epoch * num_update_steps_per_epoch)
|
progress_bar.update(completed_steps)
|
||||||
completed_steps = starting_epoch * num_update_steps_per_epoch
|
|
||||||
|
|
||||||
for epoch in range(starting_epoch, args.num_train_epochs):
|
for epoch in range(starting_epoch, args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
if args.with_tracking:
|
if args.with_tracking:
|
||||||
total_loss = 0
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
|
||||||
# We need to skip steps until we reach the resumed step
|
# We skip the first `n` batches in the dataloader when resuming from a checkpoint
|
||||||
if args.resume_from_checkpoint and epoch == starting_epoch:
|
active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
|
||||||
if resume_step is not None and step < resume_step:
|
else:
|
||||||
if step % args.gradient_accumulation_steps == 0:
|
active_dataloader = train_dataloader
|
||||||
progress_bar.update(1)
|
for step, batch in enumerate(active_dataloader):
|
||||||
completed_steps += 1
|
|
||||||
continue
|
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
# We keep track of the loss at each epoch
|
# We keep track of the loss at each epoch
|
||||||
|
|||||||
Reference in New Issue
Block a user