Update no_trainer scripts with new Accelerate functionalities (#16617)
Adds logging and save/loading to the Accelerate scripts Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
@@ -185,6 +185,23 @@ def parse_args():
|
|||||||
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
||||||
)
|
)
|
||||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--checkpointing_steps",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--resume_from_checkpoint",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="If the training should continue from a checkpoint folder.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--with_tracking",
|
||||||
|
required=False,
|
||||||
|
help="Whether to load in all available experiment trackers from the environment and use them for logging.",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Sanity checks
|
# Sanity checks
|
||||||
@@ -208,7 +225,8 @@ def main():
|
|||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||||
accelerator = Accelerator()
|
# If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
|
||||||
|
accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
|
||||||
# Make one log on every process with the configuration for debugging.
|
# Make one log on every process with the configuration for debugging.
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||||
@@ -427,18 +445,10 @@ def main():
|
|||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||||
|
|
||||||
# Prepare everything with our `accelerator`.
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader
|
|
||||||
)
|
|
||||||
|
|
||||||
# On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
|
# On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
|
||||||
if accelerator.distributed_type == DistributedType.TPU:
|
if accelerator.distributed_type == DistributedType.TPU:
|
||||||
model.tie_weights()
|
model.tie_weights()
|
||||||
|
|
||||||
# Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
|
|
||||||
# shorter in multiprocess)
|
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
@@ -453,6 +463,23 @@ def main():
|
|||||||
num_training_steps=args.max_train_steps,
|
num_training_steps=args.max_train_steps,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Prepare everything with our `accelerator`.
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||||
|
)
|
||||||
|
|
||||||
|
# Figure out how many steps we should save the Accelerator states
|
||||||
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
checkpointing_steps = args.checkpointing_steps
|
||||||
|
if args.checkpointing_steps.isdigit():
|
||||||
|
checkpointing_steps = int(args.checkpointing_steps)
|
||||||
|
else:
|
||||||
|
checkpointing_steps = None
|
||||||
|
|
||||||
|
# We need to initialize the trackers we use, and also store our configuration
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.init_trackers("clm_no_trainer", args)
|
||||||
|
|
||||||
# Train!
|
# Train!
|
||||||
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
||||||
|
|
||||||
@@ -467,11 +494,38 @@ def main():
|
|||||||
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
||||||
completed_steps = 0
|
completed_steps = 0
|
||||||
|
|
||||||
|
# Potentially load in the weights and states from a previous save
|
||||||
|
if args.resume_from_checkpoint:
|
||||||
|
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
|
||||||
|
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||||
|
accelerator.load_state(args.resume_from_checkpoint)
|
||||||
|
resume_step = None
|
||||||
|
path = args.resume_from_checkpoint
|
||||||
|
else:
|
||||||
|
# Get the most recent checkpoint
|
||||||
|
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
|
||||||
|
dirs.sort(key=os.path.getctime)
|
||||||
|
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
|
||||||
|
if "epoch" in path:
|
||||||
|
args.num_train_epochs -= int(path.replace("epoch_", ""))
|
||||||
|
else:
|
||||||
|
resume_step = int(path.replace("step_", ""))
|
||||||
|
args.num_train_epochs -= resume_step // len(train_dataloader)
|
||||||
|
resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step
|
||||||
|
|
||||||
for epoch in range(args.num_train_epochs):
|
for epoch in range(args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
for step, batch in enumerate(train_dataloader):
|
||||||
|
# We need to skip steps until we reach the resumed step
|
||||||
|
if args.resume_from_checkpoint and epoch == 0 and step < resume_step:
|
||||||
|
continue
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
# We keep track of the loss at each epoch
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss += loss.detach().float()
|
||||||
loss = loss / args.gradient_accumulation_steps
|
loss = loss / args.gradient_accumulation_steps
|
||||||
accelerator.backward(loss)
|
accelerator.backward(loss)
|
||||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||||
@@ -481,6 +535,10 @@ def main():
|
|||||||
progress_bar.update(1)
|
progress_bar.update(1)
|
||||||
completed_steps += 1
|
completed_steps += 1
|
||||||
|
|
||||||
|
if isinstance(checkpointing_steps, int):
|
||||||
|
if completed_steps % checkpointing_steps == 0:
|
||||||
|
accelerator.save_state(f"step_{completed_steps}")
|
||||||
|
|
||||||
if completed_steps >= args.max_train_steps:
|
if completed_steps >= args.max_train_steps:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -502,6 +560,16 @@ def main():
|
|||||||
|
|
||||||
logger.info(f"epoch {epoch}: perplexity: {perplexity}")
|
logger.info(f"epoch {epoch}: perplexity: {perplexity}")
|
||||||
|
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.log(
|
||||||
|
{
|
||||||
|
"perplexity": perplexity,
|
||||||
|
"train_loss": total_loss,
|
||||||
|
"epoch": epoch,
|
||||||
|
},
|
||||||
|
step=completed_steps,
|
||||||
|
)
|
||||||
|
|
||||||
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = accelerator.unwrap_model(model)
|
||||||
@@ -512,6 +580,9 @@ def main():
|
|||||||
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if args.checkpointing_steps == "epoch":
|
||||||
|
accelerator.save_state(f"epoch_{epoch}")
|
||||||
|
|
||||||
if args.output_dir is not None:
|
if args.output_dir is not None:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = accelerator.unwrap_model(model)
|
||||||
|
|||||||
@@ -194,6 +194,23 @@ def parse_args():
|
|||||||
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
||||||
)
|
)
|
||||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--checkpointing_steps",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--resume_from_checkpoint",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="If the training should continue from a checkpoint folder.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--with_tracking",
|
||||||
|
required=False,
|
||||||
|
help="Whether to load in all available experiment trackers from the environment and use them for logging.",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Sanity checks
|
# Sanity checks
|
||||||
@@ -219,7 +236,8 @@ def main():
|
|||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||||
accelerator = Accelerator()
|
# If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
|
||||||
|
accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
|
||||||
# Make one log on every process with the configuration for debugging.
|
# Make one log on every process with the configuration for debugging.
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||||
@@ -468,11 +486,6 @@ def main():
|
|||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||||
|
|
||||||
# Prepare everything with our `accelerator`.
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader
|
|
||||||
)
|
|
||||||
|
|
||||||
# On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
|
# On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
|
||||||
if accelerator.distributed_type == DistributedType.TPU:
|
if accelerator.distributed_type == DistributedType.TPU:
|
||||||
model.tie_weights()
|
model.tie_weights()
|
||||||
@@ -494,6 +507,23 @@ def main():
|
|||||||
num_training_steps=args.max_train_steps,
|
num_training_steps=args.max_train_steps,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Prepare everything with our `accelerator`.
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||||
|
)
|
||||||
|
|
||||||
|
# Figure out how many steps we should save the Accelerator states
|
||||||
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
checkpointing_steps = args.checkpointing_steps
|
||||||
|
if args.checkpointing_steps.isdigit():
|
||||||
|
checkpointing_steps = int(args.checkpointing_steps)
|
||||||
|
else:
|
||||||
|
checkpointing_steps = None
|
||||||
|
|
||||||
|
# We need to initialize the trackers we use, and also store our configuration
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.init_trackers("clm_no_trainer", args)
|
||||||
|
|
||||||
# Train!
|
# Train!
|
||||||
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
||||||
|
|
||||||
@@ -508,11 +538,38 @@ def main():
|
|||||||
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
||||||
completed_steps = 0
|
completed_steps = 0
|
||||||
|
|
||||||
|
# Potentially load in the weights and states from a previous save
|
||||||
|
if args.resume_from_checkpoint:
|
||||||
|
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
|
||||||
|
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||||
|
accelerator.load_state(args.resume_from_checkpoint)
|
||||||
|
resume_step = None
|
||||||
|
path = args.resume_from_checkpoint
|
||||||
|
else:
|
||||||
|
# Get the most recent checkpoint
|
||||||
|
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
|
||||||
|
dirs.sort(key=os.path.getctime)
|
||||||
|
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
|
||||||
|
if "epoch" in path:
|
||||||
|
args.num_train_epochs -= int(path.replace("epoch_", ""))
|
||||||
|
else:
|
||||||
|
resume_step = int(path.replace("step_", ""))
|
||||||
|
args.num_train_epochs -= resume_step // len(train_dataloader)
|
||||||
|
resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step
|
||||||
|
|
||||||
for epoch in range(args.num_train_epochs):
|
for epoch in range(args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
for step, batch in enumerate(train_dataloader):
|
||||||
|
# We need to skip steps until we reach the resumed step
|
||||||
|
if args.resume_from_checkpoint and epoch == 0 and step < resume_step:
|
||||||
|
continue
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
# We keep track of the loss at each epoch
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss += loss.detach().float()
|
||||||
loss = loss / args.gradient_accumulation_steps
|
loss = loss / args.gradient_accumulation_steps
|
||||||
accelerator.backward(loss)
|
accelerator.backward(loss)
|
||||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||||
@@ -522,6 +579,10 @@ def main():
|
|||||||
progress_bar.update(1)
|
progress_bar.update(1)
|
||||||
completed_steps += 1
|
completed_steps += 1
|
||||||
|
|
||||||
|
if isinstance(checkpointing_steps, int):
|
||||||
|
if completed_steps % checkpointing_steps == 0:
|
||||||
|
accelerator.save_state(f"step_{completed_steps}")
|
||||||
|
|
||||||
if completed_steps >= args.max_train_steps:
|
if completed_steps >= args.max_train_steps:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -543,6 +604,16 @@ def main():
|
|||||||
|
|
||||||
logger.info(f"epoch {epoch}: perplexity: {perplexity}")
|
logger.info(f"epoch {epoch}: perplexity: {perplexity}")
|
||||||
|
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.log(
|
||||||
|
{
|
||||||
|
"perplexity": perplexity,
|
||||||
|
"train_loss": total_loss,
|
||||||
|
"epoch": epoch,
|
||||||
|
},
|
||||||
|
step=completed_steps,
|
||||||
|
)
|
||||||
|
|
||||||
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = accelerator.unwrap_model(model)
|
||||||
@@ -553,6 +624,9 @@ def main():
|
|||||||
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if args.checkpointing_steps == "epoch":
|
||||||
|
accelerator.save_state(f"epoch_{epoch}")
|
||||||
|
|
||||||
if args.output_dir is not None:
|
if args.output_dir is not None:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = accelerator.unwrap_model(model)
|
||||||
|
|||||||
@@ -177,6 +177,23 @@ def parse_args():
|
|||||||
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
||||||
)
|
)
|
||||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--checkpointing_steps",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--resume_from_checkpoint",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="If the training should continue from a checkpoint folder.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--with_tracking",
|
||||||
|
required=False,
|
||||||
|
help="Whether to load in all available experiment trackers from the environment and use them for logging.",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
if args.push_to_hub:
|
if args.push_to_hub:
|
||||||
@@ -246,7 +263,8 @@ def main():
|
|||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||||
accelerator = Accelerator()
|
# If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
|
||||||
|
accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
|
||||||
# Make one log on every process with the configuration for debugging.
|
# Make one log on every process with the configuration for debugging.
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||||
@@ -431,14 +449,6 @@ def main():
|
|||||||
device = accelerator.device
|
device = accelerator.device
|
||||||
model.to(device)
|
model.to(device)
|
||||||
|
|
||||||
# Prepare everything with our `accelerator`.
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader
|
|
||||||
)
|
|
||||||
|
|
||||||
# Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
|
|
||||||
# shorter in multiprocess)
|
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
@@ -453,6 +463,23 @@ def main():
|
|||||||
num_training_steps=args.max_train_steps,
|
num_training_steps=args.max_train_steps,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Prepare everything with our `accelerator`.
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||||
|
)
|
||||||
|
|
||||||
|
# Figure out how many steps we should save the Accelerator states
|
||||||
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
checkpointing_steps = args.checkpointing_steps
|
||||||
|
if args.checkpointing_steps.isdigit():
|
||||||
|
checkpointing_steps = int(args.checkpointing_steps)
|
||||||
|
else:
|
||||||
|
checkpointing_steps = None
|
||||||
|
|
||||||
|
# We need to initialize the trackers we use, and also store our configuration
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.init_trackers("clm_no_trainer", args)
|
||||||
|
|
||||||
# Metrics
|
# Metrics
|
||||||
metric = load_metric("accuracy")
|
metric = load_metric("accuracy")
|
||||||
|
|
||||||
@@ -470,11 +497,38 @@ def main():
|
|||||||
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
||||||
completed_steps = 0
|
completed_steps = 0
|
||||||
|
|
||||||
|
# Potentially load in the weights and states from a previous save
|
||||||
|
if args.resume_from_checkpoint:
|
||||||
|
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
|
||||||
|
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||||
|
accelerator.load_state(args.resume_from_checkpoint)
|
||||||
|
resume_step = None
|
||||||
|
path = args.resume_from_checkpoint
|
||||||
|
else:
|
||||||
|
# Get the most recent checkpoint
|
||||||
|
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
|
||||||
|
dirs.sort(key=os.path.getctime)
|
||||||
|
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
|
||||||
|
if "epoch" in path:
|
||||||
|
args.num_train_epochs -= int(path.replace("epoch_", ""))
|
||||||
|
else:
|
||||||
|
resume_step = int(path.replace("step_", ""))
|
||||||
|
args.num_train_epochs -= resume_step // len(train_dataloader)
|
||||||
|
resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step
|
||||||
|
|
||||||
for epoch in range(args.num_train_epochs):
|
for epoch in range(args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
for step, batch in enumerate(train_dataloader):
|
||||||
|
# We need to skip steps until we reach the resumed step
|
||||||
|
if args.resume_from_checkpoint and epoch == 0 and step < resume_step:
|
||||||
|
continue
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
# We keep track of the loss at each epoch
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss += loss.detach().float()
|
||||||
loss = loss / args.gradient_accumulation_steps
|
loss = loss / args.gradient_accumulation_steps
|
||||||
accelerator.backward(loss)
|
accelerator.backward(loss)
|
||||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||||
@@ -484,6 +538,10 @@ def main():
|
|||||||
progress_bar.update(1)
|
progress_bar.update(1)
|
||||||
completed_steps += 1
|
completed_steps += 1
|
||||||
|
|
||||||
|
if isinstance(checkpointing_steps, int):
|
||||||
|
if completed_steps % checkpointing_steps == 0:
|
||||||
|
accelerator.save_state(f"step_{completed_steps}")
|
||||||
|
|
||||||
if completed_steps >= args.max_train_steps:
|
if completed_steps >= args.max_train_steps:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -500,6 +558,16 @@ def main():
|
|||||||
eval_metric = metric.compute()
|
eval_metric = metric.compute()
|
||||||
accelerator.print(f"epoch {epoch}: {eval_metric}")
|
accelerator.print(f"epoch {epoch}: {eval_metric}")
|
||||||
|
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.log(
|
||||||
|
{
|
||||||
|
"accuracy": eval_metric,
|
||||||
|
"train_loss": total_loss,
|
||||||
|
"epoch": epoch,
|
||||||
|
},
|
||||||
|
step=completed_steps,
|
||||||
|
)
|
||||||
|
|
||||||
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = accelerator.unwrap_model(model)
|
||||||
|
|||||||
@@ -210,6 +210,23 @@ def parse_args():
|
|||||||
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
||||||
)
|
)
|
||||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--checkpointing_steps",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--resume_from_checkpoint",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="If the training should continue from a checkpoint folder.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--with_tracking",
|
||||||
|
required=False,
|
||||||
|
help="Whether to load in all available experiment trackers from the environment and use them for logging.",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Sanity checks
|
# Sanity checks
|
||||||
@@ -241,7 +258,8 @@ def main():
|
|||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||||
accelerator = Accelerator()
|
# If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
|
||||||
|
accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
|
||||||
# Make one log on every process with the configuration for debugging.
|
# Make one log on every process with the configuration for debugging.
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||||
@@ -670,14 +688,6 @@ def main():
|
|||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||||
|
|
||||||
# Prepare everything with our `accelerator`.
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader
|
|
||||||
)
|
|
||||||
|
|
||||||
# Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
|
|
||||||
# shorter in multiprocess)
|
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
@@ -692,6 +702,23 @@ def main():
|
|||||||
num_training_steps=args.max_train_steps,
|
num_training_steps=args.max_train_steps,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Prepare everything with our `accelerator`.
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||||
|
)
|
||||||
|
|
||||||
|
# Figure out how many steps we should save the Accelerator states
|
||||||
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
checkpointing_steps = args.checkpointing_steps
|
||||||
|
if args.checkpointing_steps.isdigit():
|
||||||
|
checkpointing_steps = int(args.checkpointing_steps)
|
||||||
|
else:
|
||||||
|
checkpointing_steps = None
|
||||||
|
|
||||||
|
# We need to initialize the trackers we use, and also store our configuration
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.init_trackers("clm_no_trainer", args)
|
||||||
|
|
||||||
# Train!
|
# Train!
|
||||||
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
||||||
|
|
||||||
@@ -707,11 +734,38 @@ def main():
|
|||||||
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
||||||
completed_steps = 0
|
completed_steps = 0
|
||||||
|
|
||||||
|
# Potentially load in the weights and states from a previous save
|
||||||
|
if args.resume_from_checkpoint:
|
||||||
|
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
|
||||||
|
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||||
|
accelerator.load_state(args.resume_from_checkpoint)
|
||||||
|
resume_step = None
|
||||||
|
path = args.resume_from_checkpoint
|
||||||
|
else:
|
||||||
|
# Get the most recent checkpoint
|
||||||
|
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
|
||||||
|
dirs.sort(key=os.path.getctime)
|
||||||
|
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
|
||||||
|
if "epoch" in path:
|
||||||
|
args.num_train_epochs -= int(path.replace("epoch_", ""))
|
||||||
|
else:
|
||||||
|
resume_step = int(path.replace("step_", ""))
|
||||||
|
args.num_train_epochs -= resume_step // len(train_dataloader)
|
||||||
|
resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step
|
||||||
|
|
||||||
for epoch in range(args.num_train_epochs):
|
for epoch in range(args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
for step, batch in enumerate(train_dataloader):
|
||||||
|
# We need to skip steps until we reach the resumed step
|
||||||
|
if args.resume_from_checkpoint and epoch == 0 and step < resume_step:
|
||||||
|
continue
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
# We keep track of the loss at each epoch
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss += loss.detach().float()
|
||||||
loss = loss / args.gradient_accumulation_steps
|
loss = loss / args.gradient_accumulation_steps
|
||||||
accelerator.backward(loss)
|
accelerator.backward(loss)
|
||||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||||
@@ -721,6 +775,10 @@ def main():
|
|||||||
progress_bar.update(1)
|
progress_bar.update(1)
|
||||||
completed_steps += 1
|
completed_steps += 1
|
||||||
|
|
||||||
|
if isinstance(checkpointing_steps, int):
|
||||||
|
if completed_steps % checkpointing_steps == 0:
|
||||||
|
accelerator.save_state(f"step_{completed_steps}")
|
||||||
|
|
||||||
if completed_steps >= args.max_train_steps:
|
if completed_steps >= args.max_train_steps:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -847,6 +905,20 @@ def main():
|
|||||||
predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
|
predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
|
||||||
logger.info(f"Predict metrics: {predict_metric}")
|
logger.info(f"Predict metrics: {predict_metric}")
|
||||||
|
|
||||||
|
if args.with_tracking:
|
||||||
|
log = {
|
||||||
|
"squad_v2" if args.version_2_with_negative else "squad": eval_metric,
|
||||||
|
"train_loss": total_loss,
|
||||||
|
"epoch": epoch,
|
||||||
|
}
|
||||||
|
if args.do_predict:
|
||||||
|
log["squad_v2_predict" if args.version_2_with_negative else "squad_predict"] = predict_metric
|
||||||
|
|
||||||
|
accelerator.log(log, step=completed_steps)
|
||||||
|
|
||||||
|
if args.checkpointing_steps == "epoch":
|
||||||
|
accelerator.save_state(f"epoch_{epoch}")
|
||||||
|
|
||||||
if args.output_dir is not None:
|
if args.output_dir is not None:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = accelerator.unwrap_model(model)
|
||||||
|
|||||||
@@ -239,6 +239,23 @@ def parse_args():
|
|||||||
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
||||||
)
|
)
|
||||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--checkpointing_steps",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--resume_from_checkpoint",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="If the training should continue from a checkpoint folder.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--with_tracking",
|
||||||
|
required=False,
|
||||||
|
help="Whether to load in all available experiment trackers from the environment and use them for logging.",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Sanity checks
|
# Sanity checks
|
||||||
@@ -270,7 +287,8 @@ def main():
|
|||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||||
accelerator = Accelerator()
|
# If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
|
||||||
|
accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
|
||||||
# Make one log on every process with the configuration for debugging.
|
# Make one log on every process with the configuration for debugging.
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||||
@@ -676,14 +694,6 @@ def main():
|
|||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||||
|
|
||||||
# Prepare everything with our `accelerator`.
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader
|
|
||||||
)
|
|
||||||
|
|
||||||
# Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
|
|
||||||
# shorter in multiprocess)
|
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
@@ -698,6 +708,23 @@ def main():
|
|||||||
num_training_steps=args.max_train_steps,
|
num_training_steps=args.max_train_steps,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Prepare everything with our `accelerator`.
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||||
|
)
|
||||||
|
|
||||||
|
# Figure out how many steps we should save the Accelerator states
|
||||||
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
checkpointing_steps = args.checkpointing_steps
|
||||||
|
if args.checkpointing_steps.isdigit():
|
||||||
|
checkpointing_steps = int(args.checkpointing_steps)
|
||||||
|
else:
|
||||||
|
checkpointing_steps = None
|
||||||
|
|
||||||
|
# We need to initialize the trackers we use, and also store our configuration
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.init_trackers("clm_no_trainer", args)
|
||||||
|
|
||||||
# Train!
|
# Train!
|
||||||
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
|
||||||
|
|
||||||
@@ -713,11 +740,38 @@ def main():
|
|||||||
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
||||||
completed_steps = 0
|
completed_steps = 0
|
||||||
|
|
||||||
|
# Potentially load in the weights and states from a previous save
|
||||||
|
if args.resume_from_checkpoint:
|
||||||
|
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
|
||||||
|
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||||
|
accelerator.load_state(args.resume_from_checkpoint)
|
||||||
|
resume_step = None
|
||||||
|
path = args.resume_from_checkpoint
|
||||||
|
else:
|
||||||
|
# Get the most recent checkpoint
|
||||||
|
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
|
||||||
|
dirs.sort(key=os.path.getctime)
|
||||||
|
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
|
||||||
|
if "epoch" in path:
|
||||||
|
args.num_train_epochs -= int(path.replace("epoch_", ""))
|
||||||
|
else:
|
||||||
|
resume_step = int(path.replace("step_", ""))
|
||||||
|
args.num_train_epochs -= resume_step // len(train_dataloader)
|
||||||
|
resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step
|
||||||
|
|
||||||
for epoch in range(args.num_train_epochs):
|
for epoch in range(args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
for step, batch in enumerate(train_dataloader):
|
||||||
|
# We need to skip steps until we reach the resumed step
|
||||||
|
if args.resume_from_checkpoint and epoch == 0 and step < resume_step:
|
||||||
|
continue
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
# We keep track of the loss at each epoch
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss += loss.detach().float()
|
||||||
loss = loss / args.gradient_accumulation_steps
|
loss = loss / args.gradient_accumulation_steps
|
||||||
accelerator.backward(loss)
|
accelerator.backward(loss)
|
||||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||||
@@ -727,6 +781,10 @@ def main():
|
|||||||
progress_bar.update(1)
|
progress_bar.update(1)
|
||||||
completed_steps += 1
|
completed_steps += 1
|
||||||
|
|
||||||
|
if isinstance(checkpointing_steps, int):
|
||||||
|
if completed_steps % checkpointing_steps == 0:
|
||||||
|
accelerator.save_state(f"step_{completed_steps}")
|
||||||
|
|
||||||
if completed_steps >= args.max_train_steps:
|
if completed_steps >= args.max_train_steps:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -810,6 +868,20 @@ def main():
|
|||||||
predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
|
predict_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
|
||||||
logger.info(f"Predict metrics: {predict_metric}")
|
logger.info(f"Predict metrics: {predict_metric}")
|
||||||
|
|
||||||
|
if args.with_tracking:
|
||||||
|
log = {
|
||||||
|
"squad_v2" if args.version_2_with_negative else "squad": eval_metric,
|
||||||
|
"train_loss": total_loss,
|
||||||
|
"epoch": epoch,
|
||||||
|
}
|
||||||
|
if args.do_predict:
|
||||||
|
log["squad_v2_predict" if args.version_2_with_negative else "squad_predict"] = predict_metric
|
||||||
|
|
||||||
|
accelerator.log(log, step=completed_steps)
|
||||||
|
|
||||||
|
if args.checkpointing_steps == "epoch":
|
||||||
|
accelerator.save_state(f"epoch_{epoch}")
|
||||||
|
|
||||||
if args.output_dir is not None:
|
if args.output_dir is not None:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = accelerator.unwrap_model(model)
|
||||||
|
|||||||
@@ -262,6 +262,23 @@ def parse_args():
|
|||||||
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
||||||
)
|
)
|
||||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--checkpointing_steps",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--resume_from_checkpoint",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="If the training should continue from a checkpoint folder.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--with_tracking",
|
||||||
|
required=False,
|
||||||
|
help="Whether to load in all available experiment trackers from the environment and use them for logging.",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Sanity checks
|
# Sanity checks
|
||||||
@@ -296,7 +313,8 @@ def main():
|
|||||||
"`--source_prefix 'summarize: ' `"
|
"`--source_prefix 'summarize: ' `"
|
||||||
)
|
)
|
||||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||||
accelerator = Accelerator()
|
# If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
|
||||||
|
accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
|
||||||
# Make one log on every process with the configuration for debugging.
|
# Make one log on every process with the configuration for debugging.
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||||
@@ -494,14 +512,6 @@ def main():
|
|||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||||
|
|
||||||
# Prepare everything with our `accelerator`.
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader
|
|
||||||
)
|
|
||||||
|
|
||||||
# Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
|
|
||||||
# shorter in multiprocess)
|
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
@@ -516,6 +526,23 @@ def main():
|
|||||||
num_training_steps=args.max_train_steps,
|
num_training_steps=args.max_train_steps,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Prepare everything with our `accelerator`.
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||||
|
)
|
||||||
|
|
||||||
|
# Figure out how many steps we should save the Accelerator states
|
||||||
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
checkpointing_steps = args.checkpointing_steps
|
||||||
|
if args.checkpointing_steps.isdigit():
|
||||||
|
checkpointing_steps = int(args.checkpointing_steps)
|
||||||
|
else:
|
||||||
|
checkpointing_steps = None
|
||||||
|
|
||||||
|
# We need to initialize the trackers we use, and also store our configuration
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.init_trackers("summarization_no_trainer", args)
|
||||||
|
|
||||||
# Metric
|
# Metric
|
||||||
metric = load_metric("rouge")
|
metric = load_metric("rouge")
|
||||||
|
|
||||||
@@ -532,12 +559,38 @@ def main():
|
|||||||
# Only show the progress bar once on each machine.
|
# Only show the progress bar once on each machine.
|
||||||
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
||||||
completed_steps = 0
|
completed_steps = 0
|
||||||
|
# Potentially load in the weights and states from a previous save
|
||||||
|
if args.resume_from_checkpoint:
|
||||||
|
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
|
||||||
|
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||||
|
accelerator.load_state(args.resume_from_checkpoint)
|
||||||
|
resume_step = None
|
||||||
|
path = args.resume_from_checkpoint
|
||||||
|
else:
|
||||||
|
# Get the most recent checkpoint
|
||||||
|
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
|
||||||
|
dirs.sort(key=os.path.getctime)
|
||||||
|
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
|
||||||
|
if "epoch" in path:
|
||||||
|
args.num_train_epochs -= int(path.replace("epoch_", ""))
|
||||||
|
else:
|
||||||
|
resume_step = int(path.replace("step_", ""))
|
||||||
|
args.num_train_epochs -= resume_step // len(train_dataloader)
|
||||||
|
resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step
|
||||||
|
|
||||||
for epoch in range(args.num_train_epochs):
|
for epoch in range(args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
for step, batch in enumerate(train_dataloader):
|
||||||
|
# We need to skip steps until we reach the resumed step
|
||||||
|
if args.resume_from_checkpoint and epoch == 0 and step < resume_step:
|
||||||
|
continue
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
# We keep track of the loss at each epoch
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss += loss.detach().float()
|
||||||
loss = loss / args.gradient_accumulation_steps
|
loss = loss / args.gradient_accumulation_steps
|
||||||
accelerator.backward(loss)
|
accelerator.backward(loss)
|
||||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||||
@@ -547,6 +600,10 @@ def main():
|
|||||||
progress_bar.update(1)
|
progress_bar.update(1)
|
||||||
completed_steps += 1
|
completed_steps += 1
|
||||||
|
|
||||||
|
if isinstance(checkpointing_steps, int):
|
||||||
|
if completed_steps % checkpointing_steps == 0:
|
||||||
|
accelerator.save_state(f"step_{completed_steps}")
|
||||||
|
|
||||||
if completed_steps >= args.max_train_steps:
|
if completed_steps >= args.max_train_steps:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -596,6 +653,11 @@ def main():
|
|||||||
|
|
||||||
logger.info(result)
|
logger.info(result)
|
||||||
|
|
||||||
|
if args.with_tracking:
|
||||||
|
result["train_loss"] = total_loss
|
||||||
|
result["epoch"] = epoch
|
||||||
|
accelerator.log(result, step=completed_steps)
|
||||||
|
|
||||||
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = accelerator.unwrap_model(model)
|
||||||
@@ -606,6 +668,9 @@ def main():
|
|||||||
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if args.checkpointing_steps == "epoch":
|
||||||
|
accelerator.save_state(f"epoch_{epoch}")
|
||||||
|
|
||||||
if args.output_dir is not None:
|
if args.output_dir is not None:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = accelerator.unwrap_model(model)
|
||||||
|
|||||||
@@ -150,6 +150,24 @@ def parse_args():
|
|||||||
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
||||||
)
|
)
|
||||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||||
|
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--checkpointing_steps",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--resume_from_checkpoint",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="If the training should continue from a checkpoint folder.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--with_tracking",
|
||||||
|
required=False,
|
||||||
|
help="Whether to load in all available experiment trackers from the environment and use them for logging.",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Sanity checks
|
# Sanity checks
|
||||||
@@ -173,7 +191,8 @@ def main():
|
|||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||||
accelerator = Accelerator()
|
# If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
|
||||||
|
accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
|
||||||
# Make one log on every process with the configuration for debugging.
|
# Make one log on every process with the configuration for debugging.
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||||
@@ -376,14 +395,6 @@ def main():
|
|||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||||
|
|
||||||
# Prepare everything with our `accelerator`.
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader
|
|
||||||
)
|
|
||||||
|
|
||||||
# Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
|
|
||||||
# shorter in multiprocess)
|
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
@@ -398,6 +409,23 @@ def main():
|
|||||||
num_training_steps=args.max_train_steps,
|
num_training_steps=args.max_train_steps,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Prepare everything with our `accelerator`.
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||||
|
)
|
||||||
|
|
||||||
|
# Figure out how many steps we should save the Accelerator states
|
||||||
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
checkpointing_steps = args.checkpointing_steps
|
||||||
|
if args.checkpointing_steps.isdigit():
|
||||||
|
checkpointing_steps = int(args.checkpointing_steps)
|
||||||
|
else:
|
||||||
|
checkpointing_steps = None
|
||||||
|
|
||||||
|
# We need to initialize the trackers we use, and also store our configuration
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.init_trackers("glue_no_trainer", args)
|
||||||
|
|
||||||
# Get the metric function
|
# Get the metric function
|
||||||
if args.task_name is not None:
|
if args.task_name is not None:
|
||||||
metric = load_metric("glue", args.task_name)
|
metric = load_metric("glue", args.task_name)
|
||||||
@@ -417,12 +445,38 @@ def main():
|
|||||||
# Only show the progress bar once on each machine.
|
# Only show the progress bar once on each machine.
|
||||||
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
||||||
completed_steps = 0
|
completed_steps = 0
|
||||||
|
# Potentially load in the weights and states from a previous save
|
||||||
|
if args.resume_from_checkpoint:
|
||||||
|
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
|
||||||
|
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||||
|
accelerator.load_state(args.resume_from_checkpoint)
|
||||||
|
resume_step = None
|
||||||
|
path = args.resume_from_checkpoint
|
||||||
|
else:
|
||||||
|
# Get the most recent checkpoint
|
||||||
|
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
|
||||||
|
dirs.sort(key=os.path.getctime)
|
||||||
|
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
|
||||||
|
if "epoch" in path:
|
||||||
|
args.num_train_epochs -= int(path.replace("epoch_", ""))
|
||||||
|
else:
|
||||||
|
resume_step = int(path.replace("step_", ""))
|
||||||
|
args.num_train_epochs -= resume_step // len(train_dataloader)
|
||||||
|
resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step
|
||||||
|
|
||||||
for epoch in range(args.num_train_epochs):
|
for epoch in range(args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
for step, batch in enumerate(train_dataloader):
|
||||||
|
# We need to skip steps until we reach the resumed step
|
||||||
|
if args.resume_from_checkpoint and epoch == 0 and step < resume_step:
|
||||||
|
continue
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
# We keep track of the loss at each epoch
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss += loss.detach().float()
|
||||||
loss = loss / args.gradient_accumulation_steps
|
loss = loss / args.gradient_accumulation_steps
|
||||||
accelerator.backward(loss)
|
accelerator.backward(loss)
|
||||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||||
@@ -432,6 +486,10 @@ def main():
|
|||||||
progress_bar.update(1)
|
progress_bar.update(1)
|
||||||
completed_steps += 1
|
completed_steps += 1
|
||||||
|
|
||||||
|
if isinstance(checkpointing_steps, int):
|
||||||
|
if completed_steps % checkpointing_steps == 0:
|
||||||
|
accelerator.save_state(f"step_{completed_steps}")
|
||||||
|
|
||||||
if completed_steps >= args.max_train_steps:
|
if completed_steps >= args.max_train_steps:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -447,6 +505,16 @@ def main():
|
|||||||
eval_metric = metric.compute()
|
eval_metric = metric.compute()
|
||||||
logger.info(f"epoch {epoch}: {eval_metric}")
|
logger.info(f"epoch {epoch}: {eval_metric}")
|
||||||
|
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.log(
|
||||||
|
{
|
||||||
|
"accuracy" if args.task_name is not None else "glue": eval_metric,
|
||||||
|
"train_loss": total_loss,
|
||||||
|
"epoch": epoch,
|
||||||
|
},
|
||||||
|
step=completed_steps,
|
||||||
|
)
|
||||||
|
|
||||||
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = accelerator.unwrap_model(model)
|
||||||
@@ -457,6 +525,9 @@ def main():
|
|||||||
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if args.checkpointing_steps == "epoch":
|
||||||
|
accelerator.save_state(f"epoch_{epoch}")
|
||||||
|
|
||||||
if args.output_dir is not None:
|
if args.output_dir is not None:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = accelerator.unwrap_model(model)
|
||||||
|
|||||||
@@ -204,6 +204,23 @@ def parse_args():
|
|||||||
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
||||||
)
|
)
|
||||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--checkpointing_steps",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--resume_from_checkpoint",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="If the training should continue from a checkpoint folder.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--with_tracking",
|
||||||
|
required=False,
|
||||||
|
help="Whether to load in all available experiment trackers from the environment and use them for logging.",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Sanity checks
|
# Sanity checks
|
||||||
@@ -227,7 +244,8 @@ def main():
|
|||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||||
accelerator = Accelerator()
|
# If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
|
||||||
|
accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
|
||||||
# Make one log on every process with the configuration for debugging.
|
# Make one log on every process with the configuration for debugging.
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||||
@@ -491,14 +509,6 @@ def main():
|
|||||||
device = accelerator.device
|
device = accelerator.device
|
||||||
model.to(device)
|
model.to(device)
|
||||||
|
|
||||||
# Prepare everything with our `accelerator`.
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader
|
|
||||||
)
|
|
||||||
|
|
||||||
# Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
|
|
||||||
# shorter in multiprocess)
|
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
@@ -513,6 +523,23 @@ def main():
|
|||||||
num_training_steps=args.max_train_steps,
|
num_training_steps=args.max_train_steps,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Prepare everything with our `accelerator`.
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||||
|
)
|
||||||
|
|
||||||
|
# Figure out how many steps we should save the Accelerator states
|
||||||
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
checkpointing_steps = args.checkpointing_steps
|
||||||
|
if args.checkpointing_steps.isdigit():
|
||||||
|
checkpointing_steps = int(args.checkpointing_steps)
|
||||||
|
else:
|
||||||
|
checkpointing_steps = None
|
||||||
|
|
||||||
|
# We need to initialize the trackers we use, and also store our configuration
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.init_trackers("clm_no_trainer", args)
|
||||||
|
|
||||||
# Metrics
|
# Metrics
|
||||||
metric = load_metric("seqeval")
|
metric = load_metric("seqeval")
|
||||||
|
|
||||||
@@ -569,12 +596,38 @@ def main():
|
|||||||
# Only show the progress bar once on each machine.
|
# Only show the progress bar once on each machine.
|
||||||
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
||||||
completed_steps = 0
|
completed_steps = 0
|
||||||
|
# Potentially load in the weights and states from a previous save
|
||||||
|
if args.resume_from_checkpoint:
|
||||||
|
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
|
||||||
|
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||||
|
accelerator.load_state(args.resume_from_checkpoint)
|
||||||
|
resume_step = None
|
||||||
|
path = args.resume_from_checkpoint
|
||||||
|
else:
|
||||||
|
# Get the most recent checkpoint
|
||||||
|
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
|
||||||
|
dirs.sort(key=os.path.getctime)
|
||||||
|
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
|
||||||
|
if "epoch" in path:
|
||||||
|
args.num_train_epochs -= int(path.replace("epoch_", ""))
|
||||||
|
else:
|
||||||
|
resume_step = int(path.replace("step_", ""))
|
||||||
|
args.num_train_epochs -= resume_step // len(train_dataloader)
|
||||||
|
resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step
|
||||||
|
|
||||||
for epoch in range(args.num_train_epochs):
|
for epoch in range(args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
for step, batch in enumerate(train_dataloader):
|
||||||
|
# We need to skip steps until we reach the resumed step
|
||||||
|
if args.resume_from_checkpoint and epoch == 0 and step < resume_step:
|
||||||
|
continue
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
# We keep track of the loss at each epoch
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss += loss.detach().float()
|
||||||
loss = loss / args.gradient_accumulation_steps
|
loss = loss / args.gradient_accumulation_steps
|
||||||
accelerator.backward(loss)
|
accelerator.backward(loss)
|
||||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||||
@@ -584,6 +637,10 @@ def main():
|
|||||||
progress_bar.update(1)
|
progress_bar.update(1)
|
||||||
completed_steps += 1
|
completed_steps += 1
|
||||||
|
|
||||||
|
if isinstance(checkpointing_steps, int):
|
||||||
|
if completed_steps % checkpointing_steps == 0:
|
||||||
|
accelerator.save_state(f"step_{completed_steps}")
|
||||||
|
|
||||||
if completed_steps >= args.max_train_steps:
|
if completed_steps >= args.max_train_steps:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -608,6 +665,15 @@ def main():
|
|||||||
# eval_metric = metric.compute()
|
# eval_metric = metric.compute()
|
||||||
eval_metric = compute_metrics()
|
eval_metric = compute_metrics()
|
||||||
accelerator.print(f"epoch {epoch}:", eval_metric)
|
accelerator.print(f"epoch {epoch}:", eval_metric)
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.log(
|
||||||
|
{
|
||||||
|
"seqeval": eval_metric,
|
||||||
|
"train_loss": total_loss,
|
||||||
|
"epoch": epoch,
|
||||||
|
},
|
||||||
|
step=completed_steps,
|
||||||
|
)
|
||||||
|
|
||||||
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
@@ -619,6 +685,9 @@ def main():
|
|||||||
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if args.checkpointing_steps == "epoch":
|
||||||
|
accelerator.save_state(f"epoch_{epoch}")
|
||||||
|
|
||||||
if args.output_dir is not None:
|
if args.output_dir is not None:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = accelerator.unwrap_model(model)
|
||||||
|
|||||||
@@ -243,6 +243,23 @@ def parse_args():
|
|||||||
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
"--hub_model_id", type=str, help="The name of the repository to keep in sync with the local `output_dir`."
|
||||||
)
|
)
|
||||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||||
|
parser.add_argument(
|
||||||
|
"--checkpointing_steps",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="Whether the various states should be saved at the end of every n steps, or 'epoch' for each epoch.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--resume_from_checkpoint",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help="If the training should continue from a checkpoint folder.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--with_tracking",
|
||||||
|
required=False,
|
||||||
|
help="Whether to load in all available experiment trackers from the environment and use them for logging.",
|
||||||
|
)
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
# Sanity checks
|
# Sanity checks
|
||||||
@@ -268,7 +285,8 @@ def main():
|
|||||||
args = parse_args()
|
args = parse_args()
|
||||||
|
|
||||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||||
accelerator = Accelerator()
|
# If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
|
||||||
|
accelerator = Accelerator(log_with="all") if args.with_tracking else Accelerator()
|
||||||
|
|
||||||
# Make one log on every process with the configuration for debugging.
|
# Make one log on every process with the configuration for debugging.
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
@@ -472,14 +490,6 @@ def main():
|
|||||||
]
|
]
|
||||||
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
|
||||||
|
|
||||||
# Prepare everything with our `accelerator`.
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
|
||||||
model, optimizer, train_dataloader, eval_dataloader
|
|
||||||
)
|
|
||||||
|
|
||||||
# Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
|
|
||||||
# shorter in multiprocess)
|
|
||||||
|
|
||||||
# Scheduler and math around the number of training steps.
|
# Scheduler and math around the number of training steps.
|
||||||
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
|
||||||
if args.max_train_steps is None:
|
if args.max_train_steps is None:
|
||||||
@@ -494,6 +504,23 @@ def main():
|
|||||||
num_training_steps=args.max_train_steps,
|
num_training_steps=args.max_train_steps,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Prepare everything with our `accelerator`.
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
|
||||||
|
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
|
||||||
|
)
|
||||||
|
|
||||||
|
# Figure out how many steps we should save the Accelerator states
|
||||||
|
if hasattr(args.checkpointing_steps, "isdigit"):
|
||||||
|
checkpointing_steps = args.checkpointing_steps
|
||||||
|
if args.checkpointing_steps.isdigit():
|
||||||
|
checkpointing_steps = int(args.checkpointing_steps)
|
||||||
|
else:
|
||||||
|
checkpointing_steps = None
|
||||||
|
|
||||||
|
# We need to initialize the trackers we use, and also store our configuration
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.init_trackers("translation_no_trainer", args)
|
||||||
|
|
||||||
metric = load_metric("sacrebleu")
|
metric = load_metric("sacrebleu")
|
||||||
|
|
||||||
def postprocess_text(preds, labels):
|
def postprocess_text(preds, labels):
|
||||||
@@ -516,11 +543,38 @@ def main():
|
|||||||
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
|
||||||
completed_steps = 0
|
completed_steps = 0
|
||||||
|
|
||||||
|
# Potentially load in the weights and states from a previous save
|
||||||
|
if args.resume_from_checkpoint:
|
||||||
|
if args.resume_from_checkpoint is not None or args.resume_from_checkpoint != "":
|
||||||
|
accelerator.print(f"Resumed from checkpoint: {args.resume_from_checkpoint}")
|
||||||
|
accelerator.load_state(args.resume_from_checkpoint)
|
||||||
|
resume_step = None
|
||||||
|
path = args.resume_from_checkpoint
|
||||||
|
else:
|
||||||
|
# Get the most recent checkpoint
|
||||||
|
dirs = [f.name for f in os.scandir(os.getcwd()) if f.is_dir()]
|
||||||
|
dirs.sort(key=os.path.getctime)
|
||||||
|
path = dirs[-1] # Sorts folders by date modified, most recent checkpoint is the last
|
||||||
|
if "epoch" in path:
|
||||||
|
args.num_train_epochs -= int(path.replace("epoch_", ""))
|
||||||
|
else:
|
||||||
|
resume_step = int(path.replace("step_", ""))
|
||||||
|
args.num_train_epochs -= resume_step // len(train_dataloader)
|
||||||
|
resume_step = (args.num_train_epochs * len(train_dataloader)) - resume_step
|
||||||
|
|
||||||
for epoch in range(args.num_train_epochs):
|
for epoch in range(args.num_train_epochs):
|
||||||
model.train()
|
model.train()
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss = 0
|
||||||
for step, batch in enumerate(train_dataloader):
|
for step, batch in enumerate(train_dataloader):
|
||||||
|
# We need to skip steps until we reach the resumed step
|
||||||
|
if args.resume_from_checkpoint and epoch == 0 and step < resume_step:
|
||||||
|
continue
|
||||||
outputs = model(**batch)
|
outputs = model(**batch)
|
||||||
loss = outputs.loss
|
loss = outputs.loss
|
||||||
|
# We keep track of the loss at each epoch
|
||||||
|
if args.with_tracking:
|
||||||
|
total_loss += loss.detach().float()
|
||||||
loss = loss / args.gradient_accumulation_steps
|
loss = loss / args.gradient_accumulation_steps
|
||||||
accelerator.backward(loss)
|
accelerator.backward(loss)
|
||||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||||
@@ -530,6 +584,10 @@ def main():
|
|||||||
progress_bar.update(1)
|
progress_bar.update(1)
|
||||||
completed_steps += 1
|
completed_steps += 1
|
||||||
|
|
||||||
|
if isinstance(checkpointing_steps, int):
|
||||||
|
if completed_steps % checkpointing_steps == 0:
|
||||||
|
accelerator.save_state(f"step_{completed_steps}")
|
||||||
|
|
||||||
if completed_steps >= args.max_train_steps:
|
if completed_steps >= args.max_train_steps:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -574,6 +632,16 @@ def main():
|
|||||||
eval_metric = metric.compute()
|
eval_metric = metric.compute()
|
||||||
logger.info({"bleu": eval_metric["score"]})
|
logger.info({"bleu": eval_metric["score"]})
|
||||||
|
|
||||||
|
if args.with_tracking:
|
||||||
|
accelerator.log(
|
||||||
|
{
|
||||||
|
"blue": eval_metric["score"],
|
||||||
|
"train_loss": total_loss,
|
||||||
|
"epoch": epoch,
|
||||||
|
},
|
||||||
|
step=completed_steps,
|
||||||
|
)
|
||||||
|
|
||||||
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
if args.push_to_hub and epoch < args.num_train_epochs - 1:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = accelerator.unwrap_model(model)
|
||||||
@@ -584,6 +652,9 @@ def main():
|
|||||||
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if args.checkpointing_steps == "epoch":
|
||||||
|
accelerator.save_state(f"epoch_{epoch}")
|
||||||
|
|
||||||
if args.output_dir is not None:
|
if args.output_dir is not None:
|
||||||
accelerator.wait_for_everyone()
|
accelerator.wait_for_everyone()
|
||||||
unwrapped_model = accelerator.unwrap_model(model)
|
unwrapped_model = accelerator.unwrap_model(model)
|
||||||
|
|||||||
Reference in New Issue
Block a user