[style] consistent nn. and nn.functional: part 4 examples (#12156)
* consistent nn. and nn.functional: p4 examples
* restore
This commit is contained in:
@@ -21,8 +21,7 @@ import time
|
||||
|
||||
import psutil
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
from torch.optim import AdamW
|
||||
from torch.utils.data import BatchSampler, DataLoader, RandomSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
@@ -412,8 +411,8 @@ class Distiller:
|
||||
|
||||
loss_ce = (
|
||||
self.ce_loss_fct(
|
||||
F.log_softmax(s_logits_slct / self.temperature, dim=-1),
|
||||
F.softmax(t_logits_slct / self.temperature, dim=-1),
|
||||
nn.functional.log_softmax(s_logits_slct / self.temperature, dim=-1),
|
||||
nn.functional.softmax(t_logits_slct / self.temperature, dim=-1),
|
||||
)
|
||||
* (self.temperature) ** 2
|
||||
)
|
||||
@@ -492,9 +491,9 @@ class Distiller:
|
||||
self.iter()
|
||||
if self.n_iter % self.params.gradient_accumulation_steps == 0:
|
||||
if self.fp16:
|
||||
torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
|
||||
nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
|
||||
else:
|
||||
torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
|
||||
nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
|
||||
self.optimizer.step()
|
||||
self.optimizer.zero_grad()
|
||||
self.scheduler.step()
|
||||
|
||||
@@ -24,8 +24,7 @@ import timeit
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
@@ -138,11 +137,11 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
||||
|
||||
# multi-gpu training (should be after apex fp16 initialization)
|
||||
if args.n_gpu > 1:
|
||||
model = torch.nn.DataParallel(model)
|
||||
model = nn.DataParallel(model)
|
||||
|
||||
# Distributed training (should be after apex fp16 initialization)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model = nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
|
||||
)
|
||||
|
||||
@@ -232,15 +231,15 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
||||
loss_fct = nn.KLDivLoss(reduction="batchmean")
|
||||
loss_start = (
|
||||
loss_fct(
|
||||
F.log_softmax(start_logits_stu / args.temperature, dim=-1),
|
||||
F.softmax(start_logits_tea / args.temperature, dim=-1),
|
||||
nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
|
||||
nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
|
||||
)
|
||||
* (args.temperature ** 2)
|
||||
)
|
||||
loss_end = (
|
||||
loss_fct(
|
||||
F.log_softmax(end_logits_stu / args.temperature, dim=-1),
|
||||
F.softmax(end_logits_tea / args.temperature, dim=-1),
|
||||
nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
|
||||
nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
|
||||
)
|
||||
* (args.temperature ** 2)
|
||||
)
|
||||
@@ -262,9 +261,9 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
|
||||
tr_loss += loss.item()
|
||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||
if args.fp16:
|
||||
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
|
||||
nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
|
||||
else:
|
||||
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
|
||||
nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
|
||||
|
||||
optimizer.step()
|
||||
scheduler.step() # Update learning rate schedule
|
||||
@@ -326,8 +325,8 @@ def evaluate(args, model, tokenizer, prefix=""):
|
||||
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
|
||||
# multi-gpu evaluate
|
||||
if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
|
||||
model = torch.nn.DataParallel(model)
|
||||
if args.n_gpu > 1 and not isinstance(model, nn.DataParallel):
|
||||
model = nn.DataParallel(model)
|
||||
|
||||
# Eval!
|
||||
logger.info("***** Running evaluation {} *****".format(prefix))
|
||||
|
||||
Reference in New Issue
Block a user