[style] consistent nn. and nn.functional: part 4 examples (#12156)

* consistent nn. and nn.functional: p4 examples
* restore
2021-06-14 12:28:24 -07:00
parent 372ab9cd6d
commit 88e84186e5
26 changed files with 130 additions and 126 deletions
--- a/examples/research_projects/distillation/distiller.py
+++ b/examples/research_projects/distillation/distiller.py
@@ -21,8 +21,7 @@ import time

 import psutil
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
+from torch import nn
 from torch.optim import AdamW
 from torch.utils.data import BatchSampler, DataLoader, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
@@ -412,8 +411,8 @@ class Distiller:

        loss_ce = (
            self.ce_loss_fct(
-                F.log_softmax(s_logits_slct / self.temperature, dim=-1),
-                F.softmax(t_logits_slct / self.temperature, dim=-1),
+                nn.functional.log_softmax(s_logits_slct / self.temperature, dim=-1),
+                nn.functional.softmax(t_logits_slct / self.temperature, dim=-1),
            )
            * (self.temperature) ** 2
        )
@@ -492,9 +491,9 @@ class Distiller:
        self.iter()
        if self.n_iter % self.params.gradient_accumulation_steps == 0:
            if self.fp16:
-                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
+                nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
            else:
-                torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
+                nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.scheduler.step()
--- a/examples/research_projects/distillation/run_squad_w_distillation.py
+++ b/examples/research_projects/distillation/run_squad_w_distillation.py
@@ -24,8 +24,7 @@ import timeit

 import numpy as np
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
+from torch import nn
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
@@ -138,11 +137,11 @@ def train(args, train_dataset, model, tokenizer, teacher=None):

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

@@ -232,15 +231,15 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                loss_fct = nn.KLDivLoss(reduction="batchmean")
                loss_start = (
                    loss_fct(
-                        F.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                        F.softmax(start_logits_tea / args.temperature, dim=-1),
+                        nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                        nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
                    )
                    * (args.temperature ** 2)
                )
                loss_end = (
                    loss_fct(
-                        F.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                        F.softmax(end_logits_tea / args.temperature, dim=-1),
+                        nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                        nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
                    )
                    * (args.temperature ** 2)
                )
@@ -262,9 +261,9 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
@@ -326,8 +325,8 @@ def evaluate(args, model, tokenizer, prefix=""):
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # multi-gpu evaluate
-    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
-        model = torch.nn.DataParallel(model)
+    if args.n_gpu > 1 and not isinstance(model, nn.DataParallel):
+        model = nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))