Save the tokenizer after each epoch so that training can be resumed from a checkpoint
This commit is contained in:
committed by
Lysandre Debut
parent
f71b1bb05a
commit
a03fcf570d
@@ -274,6 +274,8 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
os.makedirs(output_dir)
|
os.makedirs(output_dir)
|
||||||
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
|
||||||
model_to_save.save_pretrained(output_dir)
|
model_to_save.save_pretrained(output_dir)
|
||||||
|
tokenizer.save_pretrained(output_dir)
|
||||||
|
|
||||||
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
|
||||||
logger.info("Saving model checkpoint to %s", output_dir)
|
logger.info("Saving model checkpoint to %s", output_dir)
|
||||||
|
|
||||||
@@ -282,6 +284,7 @@ def train(args, train_dataset, model, tokenizer):
|
|||||||
torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
|
torch.save(optimizer.state_dict(), os.path.join(output_dir, 'optimizer.pt'))
|
||||||
torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
|
torch.save(scheduler.state_dict(), os.path.join(output_dir, 'scheduler.pt'))
|
||||||
torch.save(epoch, os.path.join(output_dir, 'training_state.pt'))
|
torch.save(epoch, os.path.join(output_dir, 'training_state.pt'))
|
||||||
|
logger.info("Saving training state to %s", output_dir)
|
||||||
|
|
||||||
if args.max_steps > 0 and global_step > args.max_steps:
|
if args.max_steps > 0 and global_step > args.max_steps:
|
||||||
epoch_iterator.close()
|
epoch_iterator.close()
|
||||||
|
|||||||
Reference in New Issue
Block a user