rewamp optimization

This commit is contained in:
thomwolf
2019-07-11 14:48:22 +02:00
parent 4fef5919a5
commit ec07cf5a66
7 changed files with 138 additions and 389 deletions

View File

@@ -25,19 +25,21 @@ import random
import numpy as np
import torch
from tensorboardX import SummaryWriter
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tensorboardX import SummaryWriter
from tqdm import tqdm, trange
from pytorch_transformers import WEIGHTS_NAME
from pytorch_transformers import (BertConfig, BertForSequenceClassification,
BertTokenizer, XLMConfig,
XLMForSequenceClassification, XLMTokenizer,
XLNetConfig, XLNetForSequenceClassification,
from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
BertForSequenceClassification, BertTokenizer,
XLMConfig, XLMForSequenceClassification,
XLMTokenizer, XLNetConfig,
XLNetForSequenceClassification,
XLNetTokenizer)
from pytorch_transformers.optimization import BertAdam
from pytorch_transformers import AdamW, WarmupLinearSchedule
from utils_glue import (compute_metrics, convert_examples_to_features,
output_modes, processors)
@@ -56,24 +58,24 @@ def train(args, train_dataset, model, tokenizer):
if args.local_rank in [-1, 0]:
tb_writer = SummaryWriter()
args.train_batch_size = args.per_gpu_train_batch_size * args.n_gpu
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
if args.max_steps > 0:
num_train_optimization_steps = args.max_steps
t_total = args.max_steps
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
# Prepare optimizer
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate,
t_total=num_train_optimization_steps, warmup=args.warmup_proportion)
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
schedule = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
if args.fp16:
try:
from apex import amp
@@ -89,11 +91,11 @@ def train(args, train_dataset, model, tokenizer):
logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
logger.info(" Total optimization steps = %d", num_train_optimization_steps)
logger.info(" Total optimization steps = %d", t_total)
global_step = 0
tr_loss, logging_loss = 0.0, 0.0
optimizer.zero_grad()
model.zero_grad()
for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
model.train()
@@ -103,7 +105,7 @@ def train(args, train_dataset, model, tokenizer):
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids
'labels': batch[3]}
ouputs = model(**inputs)
loss = ouputs[0]
loss = ouputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
@@ -113,22 +115,25 @@ def train(args, train_dataset, model, tokenizer):
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
scheduler.step() # Update learning rate schedule
optimizer.step()
optimizer.zero_grad()
model.zero_grad()
global_step += 1
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
# Log metrics
if args.local_rank == -1: # Only evaluate on single GPU otherwise metrics may not average well
results = evaluate(args, model, tokenizer, prefix=global_step)
if args.local_rank == -1: # Only evaluate when single GPU otherwise metrics may not average well
results = evaluate(args, model, tokenizer)
for key, value in results.items():
tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
logging_loss = tr_loss
@@ -140,6 +145,7 @@ def train(args, train_dataset, model, tokenizer):
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
logger.info("Saving model checkpoint to %s", output_dir)
if args.max_steps > 0 and global_step > args.max_steps:
break
@@ -162,20 +168,21 @@ def evaluate(args, model, tokenizer, prefix=""):
if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
os.makedirs(eval_output_dir)
args.eval_batch_size = args.per_gpu_eval_batch_size * args.n_gpu
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
# Eval!
logger.info("***** Running evaluation *****")
logger.info("***** Running evaluation {} *****".format(prefix))
logger.info(" Num examples = %d", len(eval_dataset))
logger.info(" Batch size = %d", args.eval_batch_size)
model.eval()
eval_loss = 0
nb_eval_steps = 0
preds = None
out_label_ids = None
for batch in tqdm(eval_dataloader, desc="Evaluating"):
model.eval()
batch = tuple(t.to(args.device) for t in batch)
with torch.no_grad():
@@ -186,7 +193,7 @@ def evaluate(args, model, tokenizer, prefix=""):
outputs = model(**inputs)
tmp_eval_loss, logits = outputs[:2]
eval_loss += tmp_eval_loss.mean().item()
eval_loss += tmp_eval_loss.mean().item()
nb_eval_steps += 1
if preds is None:
preds = logits.detach().cpu().numpy()
@@ -213,7 +220,7 @@ def evaluate(args, model, tokenizer, prefix=""):
return results
def load_and_cache_examples(args, task, tokenizer, evaluate=False, overwrite_cache=False):
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
processor = processors[task]()
output_mode = output_modes[task]
# Load data features from cache or dataset file
@@ -285,20 +292,22 @@ def main():
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
help="Batch size per GPU for training.")
parser.add_argument("--eval_batch_size", default=8, type=int,
help="Total batch size for eval.")
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
help="Batch size per GPU for evaluation.")
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--learning_rate", default=5e-5, type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float,
help="Weight deay if we apply some.")
parser.add_argument("--max_grad_norm", default=1.0, type=float,
help="Max gradient norm.")
parser.add_argument("--num_train_epochs", default=3.0, type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps", default=-1, type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
parser.add_argument("--warmup_proportion", default=0.1, type=float,
help="Proportion of training with linear learning rate warmup (0.1 = 10%% of training).")
parser.add_argument("--warmup_steps", default=0, type=int,
help="Linear warmup over warmup_steps.")
parser.add_argument('--logging_steps', type=int, default=50,
help="Log every X updates steps.")
@@ -409,6 +418,7 @@ def main():
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
os.makedirs(args.output_dir)
logger.info("Saving model checkpoint to %s", args.output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
@@ -427,15 +437,18 @@ def main():
if args.do_eval and args.local_rank in [-1, 0]:
checkpoints = [args.output_dir + './' + WEIGHTS_NAME]
if args.eval_all_checkpoints:
checkpoints = list(os.path.dirname(c) for c in glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
logger.info("Evaluate the following checkpoints: %s", checkpoints)
results = {}
for checkpoint in checkpoints:
global_step = int(checkpoints.split('-')[-1])
model = model_class.from_pretrained(checkpoints)
global_step = int(checkpoint.split('-')[-1])
model = model_class.from_pretrained(checkpoint)
model.to(args.device)
result = evaluate(args, model, tokenizer, prefix=global_step)
result = dict(n + '_{}'.format())
result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
results.update(result)
return results