Reformat source code with black.

This is the result of:

    $ black --line-length 119 examples templates transformers utils hubconf.py setup.py

There's a lot of fairly long lines in the project. As a consequence, I'm
picking the longest widely accepted line length, 119 characters.

This is also Thomas' preference, because it allows for explicit variable
names, to make the code easier to understand.
This commit is contained in:
Aymeric Augustin
2019-12-21 15:46:46 +01:00
parent 63e3827c6b
commit fa84ae26d6
200 changed files with 17452 additions and 12594 deletions

View File

@@ -40,14 +40,12 @@ from utils import logger
from lm_seqs_dataset import LmSeqsDataset
from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups
class Distiller:
def __init__(self,
params: dict,
dataset: LmSeqsDataset,
token_probs: torch.tensor,
student: nn.Module,
teacher: nn.Module):
logger.info('Initializing Distiller')
def __init__(
self, params: dict, dataset: LmSeqsDataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module
):
logger.info("Initializing Distiller")
self.params = params
self.dump_path = params.dump_path
self.multi_gpu = params.multi_gpu
@@ -70,12 +68,10 @@ class Distiller:
else:
sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)
self.dataloader = DataLoader(dataset=dataset,
batch_sampler=sampler,
collate_fn=dataset.batch_sequences)
self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences)
self.temperature = params.temperature
assert self.temperature > 0.
assert self.temperature > 0.0
self.alpha_ce = params.alpha_ce
self.alpha_mlm = params.alpha_mlm
@@ -85,18 +81,18 @@ class Distiller:
self.mlm = params.mlm
if self.mlm:
logger.info(f'Using MLM loss for LM step.')
logger.info(f"Using MLM loss for LM step.")
self.mlm_mask_prop = params.mlm_mask_prop
assert 0.0 <= self.mlm_mask_prop <= 1.0
assert params.word_mask + params.word_keep + params.word_rand == 1.0
self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs
self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs
self.pred_probs = self.pred_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else self.pred_probs
self.token_probs = token_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else token_probs
if self.fp16:
self.pred_probs = self.pred_probs.half()
self.token_probs = self.token_probs.half()
else:
logger.info(f'Using CLM loss for LM step.')
logger.info(f"Using CLM loss for LM step.")
self.epoch = 0
self.n_iter = 0
@@ -107,38 +103,54 @@ class Distiller:
self.last_loss_ce = 0
self.last_loss_mlm = 0
self.last_loss_clm = 0
if self.alpha_mse > 0.: self.last_loss_mse = 0
if self.alpha_cos > 0.: self.last_loss_cos = 0
if self.alpha_mse > 0.0:
self.last_loss_mse = 0
if self.alpha_cos > 0.0:
self.last_loss_cos = 0
self.last_log = 0
self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
if self.alpha_mse > 0.:
self.mse_loss_fct = nn.MSELoss(reduction='sum')
if self.alpha_cos > 0.:
self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction='mean')
if self.alpha_mse > 0.0:
self.mse_loss_fct = nn.MSELoss(reduction="sum")
if self.alpha_cos > 0.0:
self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean")
logger.info('--- Initializing model optimizer')
logger.info("--- Initializing model optimizer")
assert params.gradient_accumulation_steps >= 1
self.num_steps_epoch = len(self.dataloader)
num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
num_train_optimization_steps = (
int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
)
no_decay = ['bias', 'LayerNorm.weight']
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
{'params': [p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': params.weight_decay},
{'params': [p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad], 'weight_decay': 0.0}
{
"params": [
p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad
],
"weight_decay": params.weight_decay,
},
{
"params": [
p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad
],
"weight_decay": 0.0,
},
]
logger.info("------ Number of trainable parameters (student): %i" % sum([p.numel() for p in self.student.parameters() if p.requires_grad]))
logger.info(
"------ Number of trainable parameters (student): %i"
% sum([p.numel() for p in self.student.parameters() if p.requires_grad])
)
logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()]))
self.optimizer = AdamW(optimizer_grouped_parameters,
lr=params.learning_rate,
eps=params.adam_epsilon,
betas=(0.9, 0.98))
self.optimizer = AdamW(
optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98)
)
warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
self.scheduler = get_linear_schedule_with_warmup(self.optimizer,
num_warmup_steps=warmup_steps,
num_training_steps=num_train_optimization_steps)
self.scheduler = get_linear_schedule_with_warmup(
self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps
)
if self.fp16:
try:
@@ -146,33 +158,36 @@ class Distiller:
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level")
self.student, self.optimizer = amp.initialize(self.student,
self.optimizer,
opt_level=self.params.fp16_opt_level)
self.student, self.optimizer = amp.initialize(
self.student, self.optimizer, opt_level=self.params.fp16_opt_level
)
self.teacher = self.teacher.half()
if self.multi_gpu:
if self.fp16:
from apex.parallel import DistributedDataParallel
logger.info("Using apex.parallel.DistributedDataParallel for distributed training.")
self.student = DistributedDataParallel(self.student)
else:
from torch.nn.parallel import DistributedDataParallel
logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
self.student = DistributedDataParallel(self.student,
device_ids=[params.local_rank],
output_device=params.local_rank,
find_unused_parameters=True)
self.student = DistributedDataParallel(
self.student,
device_ids=[params.local_rank],
output_device=params.local_rank,
find_unused_parameters=True,
)
self.is_master = params.is_master
if self.is_master:
logger.info('--- Initializing Tensorboard')
self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, 'log', 'train'))
self.tensorboard.add_text(tag='config/training', text_string=str(self.params), global_step=0)
self.tensorboard.add_text(tag='config/student', text_string=str(self.student_config), global_step=0)
logger.info("--- Initializing Tensorboard")
self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, "log", "train"))
self.tensorboard.add_text(tag="config/training", text_string=str(self.params), global_step=0)
self.tensorboard.add_text(tag="config/student", text_string=str(self.student_config), global_step=0)
def prepare_batch_mlm(self,
batch):
def prepare_batch_mlm(self, batch):
"""
Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM.
@@ -192,7 +207,7 @@ class Distiller:
token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
assert token_ids.size(0) == lengths.size(0)
attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
bs, max_seq_len = token_ids.size()
mlm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
@@ -200,11 +215,13 @@ class Distiller:
x_prob = self.token_probs[token_ids.flatten()]
n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item())
tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False)
pred_mask = torch.zeros(bs * max_seq_len, dtype=torch.bool, device=token_ids.device) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
pred_mask = torch.zeros(
bs * max_seq_len, dtype=torch.bool, device=token_ids.device
) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility
pred_mask[tgt_ids] = 1
pred_mask = pred_mask.view(bs, max_seq_len)
pred_mask[token_ids == self.params.special_tok_ids['pad_token']] = 0
pred_mask[token_ids == self.params.special_tok_ids["pad_token"]] = 0
# mask a number of words == 0 [8] (faster with fp16)
if self.fp16:
@@ -213,26 +230,29 @@ class Distiller:
pred_mask = pred_mask.view(-1)
n2 = max(n1 % 8, 8 * (n1 // 8))
if n2 != n1:
pred_mask[torch.nonzero(pred_mask).view(-1)[:n1-n2]] = 0
pred_mask[torch.nonzero(pred_mask).view(-1)[: n1 - n2]] = 0
pred_mask = pred_mask.view(bs, max_seq_len)
assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item()
_token_ids_real = token_ids[pred_mask]
_token_ids_rand = _token_ids_real.clone().random_(self.vocab_size)
_token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids['mask_token'])
_token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids["mask_token"])
probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True)
_token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
_token_ids = (
_token_ids_mask * (probs == 0).long()
+ _token_ids_real * (probs == 1).long()
+ _token_ids_rand * (probs == 2).long()
)
token_ids = token_ids.masked_scatter(pred_mask, _token_ids)
mlm_labels[~pred_mask] = -100 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
mlm_labels[~pred_mask] = -100 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
# sanity checks
assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
return token_ids, attn_mask, mlm_labels
def prepare_batch_clm(self,
batch):
def prepare_batch_clm(self, batch):
"""
Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the labels for CLM.
@@ -252,18 +272,16 @@ class Distiller:
token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
assert token_ids.size(0) == lengths.size(0)
attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
clm_labels[~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
clm_labels[~attn_mask] = -100 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
# sanity checks
assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
return token_ids, attn_mask, clm_labels
def round_batch(self,
x: torch.tensor,
lengths: torch.tensor):
def round_batch(self, x: torch.tensor, lengths: torch.tensor):
"""
For float16 only.
Sub-sample sentences in a batch, and add padding, so that each dimension is a multiple of 8.
@@ -299,9 +317,9 @@ class Distiller:
pad = 8 - (ml1 % 8)
ml2 = ml1 + pad
if self.mlm:
pad_id = self.params.special_tok_ids['pad_token']
pad_id = self.params.special_tok_ids["pad_token"]
else:
pad_id = self.params.special_tok_ids['unk_token']
pad_id = self.params.special_tok_ids["unk_token"]
padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id)
x = torch.cat([x, padding_tensor], 1)
assert x.size() == (bs2, ml2)
@@ -314,20 +332,22 @@ class Distiller:
"""
The real training loop.
"""
if self.is_master: logger.info('Starting training')
if self.is_master:
logger.info("Starting training")
self.last_log = time.time()
self.student.train()
self.teacher.eval()
for _ in range(self.params.n_epoch):
if self.is_master: logger.info(f'--- Starting epoch {self.epoch}/{self.params.n_epoch-1}')
if self.is_master:
logger.info(f"--- Starting epoch {self.epoch}/{self.params.n_epoch-1}")
if self.multi_gpu:
torch.distributed.barrier()
iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
for batch in iter_bar:
if self.params.n_gpu > 0:
batch = tuple(t.to(f'cuda:{self.params.local_rank}') for t in batch)
batch = tuple(t.to(f"cuda:{self.params.local_rank}") for t in batch)
if self.mlm:
token_ids, attn_mask, lm_labels = self.prepare_batch_mlm(batch=batch)
@@ -336,22 +356,21 @@ class Distiller:
self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels)
iter_bar.update()
iter_bar.set_postfix({'Last_loss': f'{self.last_loss:.2f}',
'Avg_cum_loss': f'{self.total_loss_epoch/self.n_iter:.2f}'})
iter_bar.set_postfix(
{"Last_loss": f"{self.last_loss:.2f}", "Avg_cum_loss": f"{self.total_loss_epoch/self.n_iter:.2f}"}
)
iter_bar.close()
if self.is_master: logger.info(f'--- Ending epoch {self.epoch}/{self.params.n_epoch-1}')
if self.is_master:
logger.info(f"--- Ending epoch {self.epoch}/{self.params.n_epoch-1}")
self.end_epoch()
if self.is_master:
logger.info(f'Save very last checkpoint as `pytorch_model.bin`.')
self.save_checkpoint(checkpoint_name=f'pytorch_model.bin')
logger.info('Training is finished')
logger.info(f"Save very last checkpoint as `pytorch_model.bin`.")
self.save_checkpoint(checkpoint_name=f"pytorch_model.bin")
logger.info("Training is finished")
def step(self,
input_ids: torch.tensor,
attention_mask: torch.tensor,
lm_labels: torch.tensor):
def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor):
"""
One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
and possibly a parameter update (depending on the gradient accumulation).
@@ -363,78 +382,91 @@ class Distiller:
lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM).
"""
if self.mlm:
s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
s_logits, s_hidden_states = self.student(
input_ids=input_ids, attention_mask=attention_mask
) # (bs, seq_length, voc_size)
with torch.no_grad():
t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
t_logits, t_hidden_states = self.teacher(
input_ids=input_ids, attention_mask=attention_mask
) # (bs, seq_length, voc_size)
else:
s_logits, _, s_hidden_states = self.student(input_ids=input_ids, attention_mask=None) # (bs, seq_length, voc_size)
s_logits, _, s_hidden_states = self.student(
input_ids=input_ids, attention_mask=None
) # (bs, seq_length, voc_size)
with torch.no_grad():
t_logits, _, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=None) # (bs, seq_length, voc_size)
t_logits, _, t_hidden_states = self.teacher(
input_ids=input_ids, attention_mask=None
) # (bs, seq_length, voc_size)
assert s_logits.size() == t_logits.size()
#https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
#https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
# https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
# https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
if self.params.restrict_ce_to_mask:
mask = (lm_labels>-1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
else:
mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
s_logits_slct = torch.masked_select(s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask
t_logits_slct = torch.masked_select(t_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask
mask = attention_mask.unsqueeze(-1).expand_as(s_logits) # (bs, seq_lenth, voc_size)
s_logits_slct = torch.masked_select(s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask
t_logits_slct = torch.masked_select(t_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask
t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1)) # (bs * seq_length, voc_size) modulo the 1s in mask
assert t_logits_slct.size() == s_logits_slct.size()
loss_ce = self.ce_loss_fct(F.log_softmax(s_logits_slct/self.temperature, dim=-1),
F.softmax(t_logits_slct/self.temperature, dim=-1)) * (self.temperature)**2
loss = self.alpha_ce*loss_ce
loss_ce = (
self.ce_loss_fct(
F.log_softmax(s_logits_slct / self.temperature, dim=-1),
F.softmax(t_logits_slct / self.temperature, dim=-1),
)
* (self.temperature) ** 2
)
loss = self.alpha_ce * loss_ce
if self.alpha_mlm > 0.:
if self.alpha_mlm > 0.0:
loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1))
loss += self.alpha_mlm * loss_mlm
if self.alpha_clm > 0.:
if self.alpha_clm > 0.0:
shift_logits = s_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous()
loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
shift_labels.view(-1))
loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
loss += self.alpha_clm * loss_clm
if self.alpha_mse > 0.:
loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
if self.alpha_mse > 0.0:
loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct) / s_logits_slct.size(
0
) # Reproducing batchmean reduction
loss += self.alpha_mse * loss_mse
if self.alpha_cos > 0.:
s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim)
t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim)
mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states) # (bs, seq_length, dim)
if self.alpha_cos > 0.0:
s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim)
t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim)
mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states) # (bs, seq_length, dim)
assert s_hidden_states.size() == t_hidden_states.size()
dim = s_hidden_states.size(-1)
s_hidden_states_slct = torch.masked_select(s_hidden_states, mask) # (bs * seq_length * dim)
s_hidden_states_slct = s_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim)
t_hidden_states_slct = torch.masked_select(t_hidden_states, mask) # (bs * seq_length * dim)
t_hidden_states_slct = t_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim)
target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,)
s_hidden_states_slct = torch.masked_select(s_hidden_states, mask) # (bs * seq_length * dim)
s_hidden_states_slct = s_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim)
t_hidden_states_slct = torch.masked_select(t_hidden_states, mask) # (bs * seq_length * dim)
t_hidden_states_slct = t_hidden_states_slct.view(-1, dim) # (bs * seq_length, dim)
target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,)
loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target)
loss += self.alpha_cos * loss_cos
self.total_loss_epoch += loss.item()
self.last_loss = loss.item()
self.last_loss_ce = loss_ce.item()
if self.alpha_mlm > 0.:
if self.alpha_mlm > 0.0:
self.last_loss_mlm = loss_mlm.item()
if self.alpha_clm > 0.:
if self.alpha_clm > 0.0:
self.last_loss_clm = loss_clm.item()
if self.alpha_mse > 0.:
if self.alpha_mse > 0.0:
self.last_loss_mse = loss_mse.item()
if self.alpha_cos > 0.:
if self.alpha_cos > 0.0:
self.last_loss_cos = loss_cos.item()
self.optimize(loss)
self.n_sequences_epoch += input_ids.size(0)
def optimize(self,
loss):
def optimize(self, loss):
"""
Normalization on the loss (gradient accumulation or distributed training), followed by
backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation).
@@ -442,7 +474,7 @@ class Distiller:
"""
# Check for NaN
if (loss != loss).data.any():
logger.error('NaN detected')
logger.error("NaN detected")
exit()
if self.multi_gpu:
@@ -452,6 +484,7 @@ class Distiller:
if self.fp16:
from apex import amp
with amp.scale_loss(loss, self.optimizer) as scaled_loss:
scaled_loss.backward()
else:
@@ -488,53 +521,84 @@ class Distiller:
return
for param_name, param in self.student.named_parameters():
self.tensorboard.add_scalar(tag='parameter_mean/' + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter)
self.tensorboard.add_scalar(tag='parameter_std/' + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter)
self.tensorboard.add_scalar(
tag="parameter_mean/" + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter
)
self.tensorboard.add_scalar(
tag="parameter_std/" + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter
)
if param.grad is None:
continue
self.tensorboard.add_scalar(tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(),global_step=self.n_total_iter)
self.tensorboard.add_scalar(tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter)
self.tensorboard.add_scalar(
tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(), global_step=self.n_total_iter
)
self.tensorboard.add_scalar(
tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter
)
self.tensorboard.add_scalar(tag="losses/cum_avg_loss_epoch", scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.n_total_iter)
self.tensorboard.add_scalar(
tag="losses/cum_avg_loss_epoch",
scalar_value=self.total_loss_epoch / self.n_iter,
global_step=self.n_total_iter,
)
self.tensorboard.add_scalar(tag="losses/loss", scalar_value=self.last_loss, global_step=self.n_total_iter)
self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter)
if self.alpha_mlm > 0.:
self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
if self.alpha_clm > 0.:
self.tensorboard.add_scalar(tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter)
if self.alpha_mse > 0.:
self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
if self.alpha_cos > 0.:
self.tensorboard.add_scalar(tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter)
self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter)
self.tensorboard.add_scalar(tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used']/1_000_000, global_step=self.n_total_iter)
self.tensorboard.add_scalar(tag="global/speed", scalar_value=time.time()-self.last_log, global_step=self.n_total_iter)
self.tensorboard.add_scalar(
tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter
)
if self.alpha_mlm > 0.0:
self.tensorboard.add_scalar(
tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter
)
if self.alpha_clm > 0.0:
self.tensorboard.add_scalar(
tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter
)
if self.alpha_mse > 0.0:
self.tensorboard.add_scalar(
tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter
)
if self.alpha_cos > 0.0:
self.tensorboard.add_scalar(
tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter
)
self.tensorboard.add_scalar(
tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter
)
self.tensorboard.add_scalar(
tag="global/memory_usage",
scalar_value=psutil.virtual_memory()._asdict()["used"] / 1_000_000,
global_step=self.n_total_iter,
)
self.tensorboard.add_scalar(
tag="global/speed", scalar_value=time.time() - self.last_log, global_step=self.n_total_iter
)
def end_epoch(self):
"""
Finally arrived at the end of epoch (full pass on dataset).
Do some tensorboard logging and checkpoint saving.
"""
logger.info(f'{self.n_sequences_epoch} sequences have been trained during this epoch.')
logger.info(f"{self.n_sequences_epoch} sequences have been trained during this epoch.")
if self.is_master:
self.save_checkpoint(checkpoint_name=f'model_epoch_{self.epoch}.pth')
self.tensorboard.add_scalar(tag='epoch/loss', scalar_value=self.total_loss_epoch/self.n_iter, global_step=self.epoch)
self.save_checkpoint(checkpoint_name=f"model_epoch_{self.epoch}.pth")
self.tensorboard.add_scalar(
tag="epoch/loss", scalar_value=self.total_loss_epoch / self.n_iter, global_step=self.epoch
)
self.epoch += 1
self.n_sequences_epoch = 0
self.n_iter = 0
self.total_loss_epoch = 0
def save_checkpoint(self,
checkpoint_name: str = 'checkpoint.pth'):
def save_checkpoint(self, checkpoint_name: str = "checkpoint.pth"):
"""
Save the current state. Only by the master process.
"""
if not self.is_master:
return
mdl_to_save = self.student.module if hasattr(self.student, 'module') else self.student
mdl_to_save = self.student.module if hasattr(self.student, "module") else self.student
mdl_to_save.config.save_pretrained(self.dump_path)
state_dict = mdl_to_save.state_dict()
torch.save(state_dict, os.path.join(self.dump_path, checkpoint_name))