[style] consistent nn. and nn.functional: part 4 examples (#12156)

* consistent nn. and nn.functional: p4 examples

* restore
This commit is contained in:
Stas Bekman
2021-06-14 12:28:24 -07:00
committed by GitHub
parent 372ab9cd6d
commit 88e84186e5
26 changed files with 130 additions and 126 deletions

View File

@@ -21,8 +21,7 @@ import time
import psutil
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn
from torch.optim import AdamW
from torch.utils.data import BatchSampler, DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler
@@ -412,8 +411,8 @@ class Distiller:
loss_ce = (
self.ce_loss_fct(
F.log_softmax(s_logits_slct / self.temperature, dim=-1),
F.softmax(t_logits_slct / self.temperature, dim=-1),
nn.functional.log_softmax(s_logits_slct / self.temperature, dim=-1),
nn.functional.softmax(t_logits_slct / self.temperature, dim=-1),
)
* (self.temperature) ** 2
)
@@ -492,9 +491,9 @@ class Distiller:
self.iter()
if self.n_iter % self.params.gradient_accumulation_steps == 0:
if self.fp16:
torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
self.optimizer.step()
self.optimizer.zero_grad()
self.scheduler.step()

View File

@@ -24,8 +24,7 @@ import timeit
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
@@ -138,11 +137,11 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
model = nn.DataParallel(model)
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(
model = nn.parallel.DistributedDataParallel(
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
)
@@ -232,15 +231,15 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
loss_fct = nn.KLDivLoss(reduction="batchmean")
loss_start = (
loss_fct(
F.log_softmax(start_logits_stu / args.temperature, dim=-1),
F.softmax(start_logits_tea / args.temperature, dim=-1),
nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
)
* (args.temperature ** 2)
)
loss_end = (
loss_fct(
F.log_softmax(end_logits_stu / args.temperature, dim=-1),
F.softmax(end_logits_tea / args.temperature, dim=-1),
nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
)
* (args.temperature ** 2)
)
@@ -262,9 +261,9 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
if args.fp16:
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
optimizer.step()
scheduler.step() # Update learning rate schedule
@@ -326,8 +325,8 @@ def evaluate(args, model, tokenizer, prefix=""):
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
# multi-gpu evaluate
if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
model = torch.nn.DataParallel(model)
if args.n_gpu > 1 and not isinstance(model, nn.DataParallel):
model = nn.DataParallel(model)
# Eval!
logger.info("***** Running evaluation {} *****".format(prefix))