Update no_trainer.py scripts to include accelerate gradient accumulation wrapper (#18473)
* Added accelerate gradient accumulation wrapper to run_image_classification_no_trainer.py example script * make fixup changes * PR comments * changed input to Acceletor based on PR comment, ran make fixup * Added comment explaining the sync_gradients statement * Fixed lr scheduler max steps * Changed run_clm_no_trainer.py script to use accelerate gradient accum wrapper * Fixed all scripts except wav2vec2 pretraining to use accelerate gradient accum wrapper * Added accelerate gradient accum wrapper for wav2vec2_pretraining_no_trainer.py script * make fixup and lr_scheduler step inserted back into run_qa_beam_search_no_trainer.py * removed changes to run_wav2vec2_pretraining_no_trainer.py script and fixed using wrong constant in qa_beam_search_no_trainer.py script
This commit is contained in:
committed by
GitHub
parent
f1f5de31ed
commit
a765b68aa6
@@ -212,9 +212,14 @@ def main():
|
||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
|
||||
# in the environment
|
||||
accelerator = (
|
||||
Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
|
||||
)
|
||||
accelerator_log_kwargs = {}
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator_log_kwargs["log_with"] = args.report_to
|
||||
accelerator_log_kwargs["logging_dir"] = args.output_dir
|
||||
|
||||
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
|
||||
|
||||
logger.info(accelerator.state)
|
||||
# Make one log on every process with the configuration for debugging.
|
||||
logging.basicConfig(
|
||||
@@ -384,8 +389,8 @@ def main():
|
||||
lr_scheduler = get_scheduler(
|
||||
name=args.lr_scheduler_type,
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=args.num_warmup_steps,
|
||||
num_training_steps=args.max_train_steps,
|
||||
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
|
||||
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
@@ -467,17 +472,20 @@ def main():
|
||||
if resume_step is not None and step < resume_step:
|
||||
completed_steps += 1
|
||||
continue
|
||||
|
||||
with accelerator.accumulate(model):
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
# We keep track of the loss at each epoch
|
||||
if args.with_tracking:
|
||||
total_loss += loss.detach().float()
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Checks if the accelerator has performed an optimization step behind the scenes
|
||||
if accelerator.sync_gradients:
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
|
||||
|
||||
@@ -249,9 +249,14 @@ def main():
|
||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
|
||||
# in the environment
|
||||
accelerator = (
|
||||
Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
|
||||
)
|
||||
accelerator_log_kwargs = {}
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator_log_kwargs["log_with"] = args.report_to
|
||||
accelerator_log_kwargs["logging_dir"] = args.output_dir
|
||||
|
||||
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
|
||||
|
||||
# Make one log on every process with the configuration for debugging.
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
@@ -486,8 +491,8 @@ def main():
|
||||
lr_scheduler = get_scheduler(
|
||||
name=args.lr_scheduler_type,
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=args.num_warmup_steps,
|
||||
num_training_steps=args.max_train_steps,
|
||||
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
|
||||
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
@@ -567,17 +572,20 @@ def main():
|
||||
if resume_step is not None and step < resume_step:
|
||||
completed_steps += 1
|
||||
continue
|
||||
|
||||
with accelerator.accumulate(model):
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
# We keep track of the loss at each epoch
|
||||
if args.with_tracking:
|
||||
total_loss += loss.detach().float()
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Checks if the accelerator has performed an optimization step behind the scenes
|
||||
if accelerator.sync_gradients:
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
|
||||
|
||||
@@ -258,9 +258,14 @@ def main():
|
||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
|
||||
# in the environment
|
||||
accelerator = (
|
||||
Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
|
||||
)
|
||||
accelerator_log_kwargs = {}
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator_log_kwargs["log_with"] = args.report_to
|
||||
accelerator_log_kwargs["logging_dir"] = args.output_dir
|
||||
|
||||
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
|
||||
|
||||
# Make one log on every process with the configuration for debugging.
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
@@ -530,8 +535,8 @@ def main():
|
||||
lr_scheduler = get_scheduler(
|
||||
name=args.lr_scheduler_type,
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=args.num_warmup_steps,
|
||||
num_training_steps=args.max_train_steps,
|
||||
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
|
||||
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
@@ -611,17 +616,20 @@ def main():
|
||||
if resume_step is not None and step < resume_step:
|
||||
completed_steps += 1
|
||||
continue
|
||||
|
||||
with accelerator.accumulate(model):
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
# We keep track of the loss at each epoch
|
||||
if args.with_tracking:
|
||||
total_loss += loss.detach().float()
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Checks if the accelerator has performed an optimization step behind the scenes
|
||||
if accelerator.sync_gradients:
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
|
||||
|
||||
@@ -65,7 +65,7 @@ MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
|
||||
|
||||
def parse_args():
|
||||
parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task")
|
||||
parser = argparse.ArgumentParser(description="Finetune a transformers model on a multiple choice task")
|
||||
parser.add_argument(
|
||||
"--dataset_name",
|
||||
type=str,
|
||||
@@ -284,9 +284,14 @@ def main():
|
||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
|
||||
# in the environment
|
||||
accelerator = (
|
||||
Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
|
||||
)
|
||||
accelerator_log_kwargs = {}
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator_log_kwargs["log_with"] = args.report_to
|
||||
accelerator_log_kwargs["logging_dir"] = args.output_dir
|
||||
|
||||
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
|
||||
|
||||
# Make one log on every process with the configuration for debugging.
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
@@ -483,8 +488,8 @@ def main():
|
||||
lr_scheduler = get_scheduler(
|
||||
name=args.lr_scheduler_type,
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=args.num_warmup_steps,
|
||||
num_training_steps=args.max_train_steps,
|
||||
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
|
||||
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
@@ -567,17 +572,20 @@ def main():
|
||||
if resume_step is not None and step < resume_step:
|
||||
completed_steps += 1
|
||||
continue
|
||||
|
||||
with accelerator.accumulate(model):
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
# We keep track of the loss at each epoch
|
||||
if args.with_tracking:
|
||||
total_loss += loss.detach().float()
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Checks if the accelerator has performed an optimization step behind the scenes
|
||||
if accelerator.sync_gradients:
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
|
||||
|
||||
@@ -297,8 +297,16 @@ def main():
|
||||
send_example_telemetry("run_qa_beam_search_no_trainer", args)
|
||||
|
||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||
# If we're using tracking, we also need to initialize it here and it will pick up all supported trackers in the environment
|
||||
accelerator = Accelerator(log_with="all", logging_dir=args.output_dir) if args.with_tracking else Accelerator()
|
||||
# If we're using tracking, we also need to initialize it here and it will pick up all supported trackers
|
||||
# in the environment
|
||||
accelerator_log_kwargs = {}
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator_log_kwargs["log_with"] = args.report_to
|
||||
accelerator_log_kwargs["logging_dir"] = args.output_dir
|
||||
|
||||
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
|
||||
|
||||
# Make one log on every process with the configuration for debugging.
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
@@ -739,8 +747,8 @@ def main():
|
||||
lr_scheduler = get_scheduler(
|
||||
name=args.lr_scheduler_type,
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=args.num_warmup_steps,
|
||||
num_training_steps=args.max_train_steps,
|
||||
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
|
||||
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
@@ -818,17 +826,22 @@ def main():
|
||||
if resume_step is not None and step < resume_step:
|
||||
completed_steps += 1
|
||||
continue
|
||||
|
||||
with accelerator.accumulate(model):
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
# We keep track of the loss at each epoch
|
||||
if args.with_tracking:
|
||||
total_loss += loss.detach().float()
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
|
||||
accelerator.backward(loss)
|
||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Checks if the accelerator has performed an optimization step behind the scenes
|
||||
if accelerator.sync_gradients:
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
|
||||
|
||||
@@ -337,9 +337,14 @@ def main():
|
||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
|
||||
# in the environment
|
||||
accelerator = (
|
||||
Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
|
||||
)
|
||||
accelerator_log_kwargs = {}
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator_log_kwargs["log_with"] = args.report_to
|
||||
accelerator_log_kwargs["logging_dir"] = args.output_dir
|
||||
|
||||
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
|
||||
|
||||
# Make one log on every process with the configuration for debugging.
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
@@ -757,8 +762,8 @@ def main():
|
||||
lr_scheduler = get_scheduler(
|
||||
name=args.lr_scheduler_type,
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=args.num_warmup_steps,
|
||||
num_training_steps=args.max_train_steps,
|
||||
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
|
||||
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
@@ -839,17 +844,21 @@ def main():
|
||||
if resume_step is not None and step < resume_step:
|
||||
completed_steps += 1
|
||||
continue
|
||||
|
||||
with accelerator.accumulate(model):
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
# We keep track of the loss at each epoch
|
||||
if args.with_tracking:
|
||||
total_loss += loss.detach().float()
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
|
||||
accelerator.backward(loss)
|
||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Checks if the accelerator has performed an optimization step behind the scenes
|
||||
if accelerator.sync_gradients:
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
|
||||
|
||||
@@ -326,9 +326,14 @@ def main():
|
||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
|
||||
# in the environment
|
||||
accelerator = (
|
||||
Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
|
||||
)
|
||||
accelerator_log_kwargs = {}
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator_log_kwargs["log_with"] = args.report_to
|
||||
accelerator_log_kwargs["logging_dir"] = args.output_dir
|
||||
|
||||
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
|
||||
|
||||
logger.info(accelerator.state, main_process_only=False)
|
||||
if accelerator.is_local_main_process:
|
||||
datasets.utils.logging.set_verbosity_warning()
|
||||
@@ -487,8 +492,8 @@ def main():
|
||||
lr_scheduler = get_scheduler(
|
||||
name=args.lr_scheduler_type,
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=args.num_warmup_steps,
|
||||
num_training_steps=args.max_train_steps,
|
||||
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
|
||||
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
@@ -563,17 +568,20 @@ def main():
|
||||
if resume_step is not None and step < resume_step:
|
||||
completed_steps += 1
|
||||
continue
|
||||
|
||||
with accelerator.accumulate(model):
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
# We keep track of the loss at each epoch
|
||||
if args.with_tracking:
|
||||
total_loss += loss.detach().float()
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Checks if the accelerator has performed an optimization step behind the scenes
|
||||
if accelerator.sync_gradients:
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
|
||||
|
||||
@@ -330,9 +330,13 @@ def main():
|
||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||
# If we're using tracking, we also need to initialize it here and it will by default pick up all supported trackers
|
||||
# in the environment
|
||||
accelerator = (
|
||||
Accelerator(log_with=args.report_to, logging_dir=args.output_dir) if args.with_tracking else Accelerator()
|
||||
)
|
||||
accelerator_log_kwargs = {}
|
||||
|
||||
if args.with_tracking:
|
||||
accelerator_log_kwargs["log_with"] = args.report_to
|
||||
accelerator_log_kwargs["logging_dir"] = args.output_dir
|
||||
|
||||
accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
|
||||
if args.source_prefix is None and args.model_name_or_path in [
|
||||
"t5-small",
|
||||
"t5-base",
|
||||
@@ -552,8 +556,8 @@ def main():
|
||||
lr_scheduler = get_scheduler(
|
||||
name=args.lr_scheduler_type,
|
||||
optimizer=optimizer,
|
||||
num_warmup_steps=args.num_warmup_steps,
|
||||
num_training_steps=args.max_train_steps,
|
||||
num_warmup_steps=args.num_warmup_steps * args.gradient_accumulation_steps,
|
||||
num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
|
||||
)
|
||||
|
||||
# Prepare everything with our `accelerator`.
|
||||
@@ -635,17 +639,20 @@ def main():
|
||||
if resume_step is not None and step < resume_step:
|
||||
completed_steps += 1
|
||||
continue
|
||||
|
||||
with accelerator.accumulate(model):
|
||||
outputs = model(**batch)
|
||||
loss = outputs.loss
|
||||
# We keep track of the loss at each epoch
|
||||
if args.with_tracking:
|
||||
total_loss += loss.detach().float()
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
accelerator.backward(loss)
|
||||
if step % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
|
||||
optimizer.step()
|
||||
lr_scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Checks if the accelerator has performed an optimization step behind the scenes
|
||||
if accelerator.sync_gradients:
|
||||
progress_bar.update(1)
|
||||
completed_steps += 1
|
||||
|
||||
|
||||
Reference in New Issue
Block a user