Merge pull request #1903 from valohai/master

Valohai integration
This commit is contained in:
Thomas Wolf
2019-12-03 16:13:01 +01:00
committed by GitHub
3 changed files with 249 additions and 4 deletions

View File

@@ -22,6 +22,7 @@ import glob
import logging
import os
import random
import json
import numpy as np
import torch
@@ -176,15 +177,23 @@ def train(args, train_dataset, model, tokenizer):
global_step += 1
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
# Log metrics
logs = {}
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
results = evaluate(args, model, tokenizer)
for key, value in results.items():
tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
eval_key = 'eval_{}'.format(key)
logs[eval_key] = value
loss_scalar = (tr_loss - logging_loss) / args.logging_steps
learning_rate_scalar = scheduler.get_lr()[0]
logs['learning_rate'] = learning_rate_scalar
logs['loss'] = loss_scalar
logging_loss = tr_loss
for key, value in logs.items():
tb_writer.add_scalar(key, value, global_step)
print(json.dumps({**logs, **{'step': global_step}}))
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
# Save model checkpoint
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))