[distillation] big update w/ new weights

This commit is contained in:
VictorSanh
2019-09-19 19:25:21 +00:00
parent 0d1dad6d53
commit 354944e607
7 changed files with 132 additions and 54 deletions

View File

@@ -18,15 +18,18 @@
import os
import math
import psutil
import time
from tensorboardX import SummaryWriter
from tqdm import trange, tqdm
import numpy as np
import psutil
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from pytorch_transformers import AdamW, WarmupLinearSchedule
from pytorch_transformers import WarmupLinearSchedule
from utils import logger
from dataset import Dataset
@@ -58,10 +61,12 @@ class Distiller:
self.alpha_ce = params.alpha_ce
self.alpha_mlm = params.alpha_mlm
self.alpha_mse = params.alpha_mse
self.alpha_cos = params.alpha_cos
assert self.alpha_ce >= 0.
assert self.alpha_mlm >= 0.
assert self.alpha_mse >= 0.
assert self.alpha_ce + self.alpha_mlm + self.alpha_mse > 0.
assert self.alpha_cos >= 0.
assert self.alpha_ce + self.alpha_mlm + self.alpha_mse + self.alpha_cos > 0.
self.mlm_mask_prop = params.mlm_mask_prop
assert 0.0 <= self.mlm_mask_prop <= 1.0
@@ -81,17 +86,21 @@ class Distiller:
self.last_loss = 0
self.last_loss_ce = 0
self.last_loss_mlm = 0
self.last_loss_mse = 0
if self.alpha_mse > 0.: self.last_loss_mse = 0
if self.alpha_cos > 0.: self.last_loss_cos = 0
self.last_log = 0
self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
self.mse_loss_fct = nn.MSELoss(reduction='sum')
if self.alpha_mse > 0.:
self.mse_loss_fct = nn.MSELoss(reduction='sum')
if self.alpha_cos > 0.:
self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction='mean')
logger.info('--- Initializing model optimizer')
assert params.gradient_accumulation_steps >= 1
self.num_steps_epoch = int(len(self.dataloader) / params.batch_size) + 1
num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
@@ -104,9 +113,12 @@ class Distiller:
lr=params.learning_rate,
eps=params.adam_epsilon,
betas=(0.9, 0.98))
warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
logger.info(f'--- Scheduler: {params.scheduler_type}')
self.scheduler = WarmupLinearSchedule(self.optimizer,
warmup_steps=warmup_steps,
t_total=num_train_optimization_steps)
warmup_steps=warmup_steps,
t_total=num_train_optimization_steps)
if self.fp16:
try:
@@ -272,11 +284,14 @@ class Distiller:
The real training loop.
"""
if self.is_master: logger.info('Starting training')
self.last_log = time.time()
self.student.train()
self.teacher.eval()
for _ in range(self.params.n_epoch):
if self.is_master: logger.info(f'--- Starting epoch {self.epoch}/{self.params.n_epoch-1}')
if self.multi_gpu:
torch.distributed.barrier()
iter_bar = trange(self.num_steps_epoch, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
for __ in range(self.num_steps_epoch):
@@ -314,9 +329,9 @@ class Distiller:
attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention.
mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels.
"""
s_logits = self.student(input_ids=input_ids, attention_mask=attention_mask)[0] # (bs, seq_length, voc_size)
s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
with torch.no_grad():
t_logits = self.teacher(input_ids=input_ids, attention_mask=attention_mask)[0] # (bs, seq_length, voc_size)
t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
assert s_logits.size() == t_logits.size()
#https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
@@ -340,6 +355,22 @@ class Distiller:
if self.alpha_mse > 0.:
loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
loss += self.alpha_mse * loss_mse
if self.alpha_cos > 0.:
s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim)
t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim)
mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states) # (bs, seq_length, dim)
assert s_hidden_states.size() == t_hidden_states.size()
dim = s_hidden_states.size(-1)
s_hidden_states_slct = torch.masked_select(s_hidden_states, mask) # (bs * seq_length * dim)
s_hidden_states_slct = s_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim)
t_hidden_states_slct = torch.masked_select(t_hidden_states, mask) # (bs * seq_length * dim)
t_hidden_states_slct = t_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim)
target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,)
loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target)
loss += self.alpha_cos * loss_cos
self.total_loss_epoch += loss.item()
self.last_loss = loss.item()
@@ -348,6 +379,8 @@ class Distiller:
self.last_loss_mlm = loss_mlm.item()
if self.alpha_mse > 0.:
self.last_loss_mse = loss_mse.item()
if self.alpha_cos > 0.:
self.last_loss_cos = loss_cos.item()
self.optimize(loss)
@@ -396,6 +429,7 @@ class Distiller:
if self.n_total_iter % self.params.log_interval == 0:
self.log_tensorboard()
self.last_log = time.time()
if self.n_total_iter % self.params.checkpoint_interval == 0:
self.save_checkpoint()
@@ -421,9 +455,12 @@ class Distiller:
self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
if self.alpha_mse > 0.:
self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
if self.alpha_cos > 0.:
self.tensorboard.add_scalar(tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter)
self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter)
self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter)
self.tensorboard.add_scalar(tag="global/speed", scalar_value=time.time()-self.last_log, global_step=self.n_total_iter)
def end_epoch(self):
"""