[style] consistent nn. and nn.functional: part 4 examples (#12156)

* consistent nn. and nn.functional: p4 examples * restore
2021-06-14 12:28:24 -07:00
parent 372ab9cd6d
commit 88e84186e5
26 changed files with 130 additions and 126 deletions
--- a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py
+++ b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py
@@ -17,7 +17,7 @@
 import logging
 import torch
-import torch.nn as nn
+from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
@@ -270,6 +270,7 @@ class AlbertForSequenceClassificationWithPabee(AlbertPreTrainedModel):
                from transformers import AlbertTokenizer
                from pabee import AlbertForSequenceClassificationWithPabee
                from torch import nn
                import torch
                tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
--- a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py
+++ b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py
@@ -294,6 +294,7 @@ class BertForSequenceClassificationWithPabee(BertPreTrainedModel):
            from transformers import BertTokenizer, BertForSequenceClassification
            from pabee import BertForSequenceClassificationWithPabee
            from torch import nn
            import torch
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
--- a/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
+++ b/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
@@ -25,6 +25,7 @@ import random
 import numpy as np
 import torch
 from torch import nn
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
@@ -117,11 +118,11 @@ def train(args, train_dataset, model, tokenizer):
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
@@ -203,9 +204,9 @@ def train(args, train_dataset, model, tokenizer):
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
@@ -291,8 +292,8 @@ def evaluate(args, model, tokenizer, prefix="", patience=0):
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
        # multi-gpu eval
-        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
+        if args.n_gpu > 1 and not isinstance(model, nn.DataParallel):
-            model = torch.nn.DataParallel(model)
+            model = nn.DataParallel(model)
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
--- a/examples/research_projects/bertology/run_bertology.py
+++ b/examples/research_projects/bertology/run_bertology.py
@@ -26,6 +26,7 @@ from datetime import datetime
 import numpy as np
 import torch
 from torch import nn
 from torch.utils.data import DataLoader, SequentialSampler, Subset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm
@@ -415,11 +416,11 @@ def main():
    # Distributed and parallel training
    model.to(args.device)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )
    elif args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
    # Print/save training arguments
    os.makedirs(args.output_dir, exist_ok=True)
--- a/examples/research_projects/bertology/run_prune_gpt.py
+++ b/examples/research_projects/bertology/run_prune_gpt.py
@@ -10,6 +10,7 @@ from datetime import datetime
 import numpy as np
 import torch
 from torch import nn
 from torch.utils.data import DataLoader, RandomSampler, TensorDataset
 from tqdm import tqdm
@@ -352,11 +353,11 @@ def main():
    # Distributed and parallel training
    model.to(args.device)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )
    elif args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
    # Print/save training arguments
    os.makedirs(args.output_dir, exist_ok=True)
--- a/examples/research_projects/deebert/run_glue_deebert.py
+++ b/examples/research_projects/deebert/run_glue_deebert.py
@@ -9,6 +9,7 @@ import time
 import numpy as np
 import torch
 from torch import nn
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
@@ -135,11 +136,11 @@ def train(args, train_dataset, model, tokenizer, train_highway=False):
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )
@@ -190,9 +191,9 @@ def train(args, train_dataset, model, tokenizer, train_highway=False):
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
@@ -255,7 +256,7 @@ def evaluate(args, model, tokenizer, prefix="", output_layer=-1, eval_highway=Fa
        # multi-gpu eval
        if args.n_gpu > 1:
-            model = torch.nn.DataParallel(model)
+            model = nn.DataParallel(model)
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
--- a/examples/research_projects/deebert/src/modeling_highway_roberta.py
+++ b/examples/research_projects/deebert/src/modeling_highway_roberta.py
@@ -1,6 +1,6 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
-import torch.nn as nn
+from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 from transformers import RobertaConfig
--- a/examples/research_projects/distillation/distiller.py
+++ b/examples/research_projects/distillation/distiller.py
@@ -21,8 +21,7 @@ import time
 import psutil
 import torch
-import torch.nn as nn
+from torch import nn
 import torch.nn.functional as F
 from torch.optim import AdamW
 from torch.utils.data import BatchSampler, DataLoader, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
@@ -412,8 +411,8 @@ class Distiller:
        loss_ce = (
            self.ce_loss_fct(
-                F.log_softmax(s_logits_slct / self.temperature, dim=-1),
+                nn.functional.log_softmax(s_logits_slct / self.temperature, dim=-1),
-                F.softmax(t_logits_slct / self.temperature, dim=-1),
+                nn.functional.softmax(t_logits_slct / self.temperature, dim=-1),
            )
            * (self.temperature) ** 2
        )
@@ -492,9 +491,9 @@ class Distiller:
        self.iter()
        if self.n_iter % self.params.gradient_accumulation_steps == 0:
            if self.fp16:
-                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
+                nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
            else:
-                torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
+                nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
            self.optimizer.step()
            self.optimizer.zero_grad()
            self.scheduler.step()
--- a/examples/research_projects/distillation/run_squad_w_distillation.py
+++ b/examples/research_projects/distillation/run_squad_w_distillation.py
@@ -24,8 +24,7 @@ import timeit
 import numpy as np
 import torch
-import torch.nn as nn
+from torch import nn
 import torch.nn.functional as F
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
@@ -138,11 +137,11 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )
@@ -232,15 +231,15 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                loss_fct = nn.KLDivLoss(reduction="batchmean")
                loss_start = (
                    loss_fct(
-                        F.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                        nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                        F.softmax(start_logits_tea / args.temperature, dim=-1),
+                        nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
                    )
                    * (args.temperature ** 2)
                )
                loss_end = (
                    loss_fct(
-                        F.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                        nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                        F.softmax(end_logits_tea / args.temperature, dim=-1),
+                        nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
                    )
                    * (args.temperature ** 2)
                )
@@ -262,9 +261,9 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
@@ -326,8 +325,8 @@ def evaluate(args, model, tokenizer, prefix=""):
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
    # multi-gpu evaluate
-    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
+    if args.n_gpu > 1 and not isinstance(model, nn.DataParallel):
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
--- a/examples/research_projects/longform-qa/eli5_utils.py
+++ b/examples/research_projects/longform-qa/eli5_utils.py
@@ -11,6 +11,7 @@ import torch
 import torch.utils.checkpoint as checkpoint
 from elasticsearch import Elasticsearch  # noqa: F401
 from elasticsearch.helpers import bulk, streaming_bulk  # noqa: F401
 from torch import nn
 from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
 from tqdm import tqdm
@@ -116,14 +117,14 @@ class ELI5DatasetQARetriver(Dataset):
        return self.make_example(idx % self.data.num_rows)
-class RetrievalQAEmbedder(torch.nn.Module):
+class RetrievalQAEmbedder(nn.Module):
    def __init__(self, sent_encoder, dim):
        super(RetrievalQAEmbedder, self).__init__()
        self.sent_encoder = sent_encoder
        self.output_dim = 128
-        self.project_q = torch.nn.Linear(dim, self.output_dim, bias=False)
+        self.project_q = nn.Linear(dim, self.output_dim, bias=False)
-        self.project_a = torch.nn.Linear(dim, self.output_dim, bias=False)
+        self.project_a = nn.Linear(dim, self.output_dim, bias=False)
-        self.ce_loss = torch.nn.CrossEntropyLoss(reduction="mean")
+        self.ce_loss = nn.CrossEntropyLoss(reduction="mean")
    def embed_sentences_checkpointed(self, input_ids, attention_mask, checkpoint_batch_size=-1):
        # reproduces BERT forward pass with checkpointing
--- a/examples/research_projects/lxmert/modeling_frcnn.py
+++ b/examples/research_projects/lxmert/modeling_frcnn.py
@@ -25,7 +25,6 @@ from typing import Dict, List, Tuple
 import numpy as np
 import torch
 from torch import nn
 from torch.nn import functional as F
 from torch.nn.modules.batchnorm import BatchNorm2d
 from torchvision.ops import RoIPool
 from torchvision.ops.boxes import batched_nms, nms
@@ -85,7 +84,7 @@ def pad_list_tensors(
            too_small = True
            tensor_i = tensor_i.unsqueeze(-1)
        assert isinstance(tensor_i, torch.Tensor)
-        tensor_i = F.pad(
+        tensor_i = nn.functional.pad(
            input=tensor_i,
            pad=(0, 0, 0, max_detections - preds_per_image[i]),
            mode="constant",
@@ -701,7 +700,7 @@ class RPNOutputs(object):
 # Main Classes
-class Conv2d(torch.nn.Conv2d):
+class Conv2d(nn.Conv2d):
    def __init__(self, *args, **kwargs):
        norm = kwargs.pop("norm", None)
        activation = kwargs.pop("activation", None)
@@ -712,9 +711,9 @@ class Conv2d(torch.nn.Conv2d):
    def forward(self, x):
        if x.numel() == 0 and self.training:
-            assert not isinstance(self.norm, torch.nn.SyncBatchNorm)
+            assert not isinstance(self.norm, nn.SyncBatchNorm)
        if x.numel() == 0:
-            assert not isinstance(self.norm, torch.nn.GroupNorm)
+            assert not isinstance(self.norm, nn.GroupNorm)
            output_shape = [
                (i + 2 * p - (di * (k - 1) + 1)) // s + 1
                for i, p, di, k, s in zip(
@@ -752,7 +751,7 @@ class LastLevelMaxPool(nn.Module):
        self.in_feature = "p5"
    def forward(self, x):
-        return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)]
+        return [nn.functional.max_pool2d(x, kernel_size=1, stride=2, padding=0)]
 class LastLevelP6P7(nn.Module):
@@ -769,7 +768,7 @@ class LastLevelP6P7(nn.Module):
    def forward(self, c5):
        p6 = self.p6(c5)
-        p7 = self.p7(F.relu(p6))
+        p7 = self.p7(nn.functional.relu(p6))
        return [p6, p7]
@@ -790,11 +789,11 @@ class BasicStem(nn.Module):
    def forward(self, x):
        x = self.conv1(x)
-        x = F.relu_(x)
+        x = nn.functional.relu_(x)
        if self.caffe_maxpool:
-            x = F.max_pool2d(x, kernel_size=3, stride=2, padding=0, ceil_mode=True)
+            x = nn.functional.max_pool2d(x, kernel_size=3, stride=2, padding=0, ceil_mode=True)
        else:
-            x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
+            x = nn.functional.max_pool2d(x, kernel_size=3, stride=2, padding=1)
        return x
    @property
@@ -881,10 +880,10 @@ class BottleneckBlock(ResNetBlockBase):
    def forward(self, x):
        out = self.conv1(x)
-        out = F.relu_(out)
+        out = nn.functional.relu_(out)
        out = self.conv2(out)
-        out = F.relu_(out)
+        out = nn.functional.relu_(out)
        out = self.conv3(out)
@@ -894,7 +893,7 @@ class BottleneckBlock(ResNetBlockBase):
            shortcut = x
        out += shortcut
-        out = F.relu_(out)
+        out = nn.functional.relu_(out)
        return out
@@ -1159,7 +1158,7 @@ class ROIOutputs(object):
        return boxes.view(num_pred, K * B).split(preds_per_image, dim=0)
    def _predict_objs(self, obj_logits, preds_per_image):
-        probs = F.softmax(obj_logits, dim=-1)
+        probs = nn.functional.softmax(obj_logits, dim=-1)
        probs = probs.split(preds_per_image, dim=0)
        return probs
@@ -1490,7 +1489,7 @@ class RPNHead(nn.Module):
        pred_objectness_logits = []
        pred_anchor_deltas = []
        for x in features:
-            t = F.relu(self.conv(x))
+            t = nn.functional.relu(self.conv(x))
            pred_objectness_logits.append(self.objectness_logits(t))
            pred_anchor_deltas.append(self.anchor_deltas(t))
        return pred_objectness_logits, pred_anchor_deltas
@@ -1650,7 +1649,7 @@ class FastRCNNOutputLayers(nn.Module):
            cls_emb = self.cls_embedding(max_class)  # [b] --> [b, 256]
            roi_features = torch.cat([roi_features, cls_emb], -1)  # [b, 2048] + [b, 256] --> [b, 2304]
            roi_features = self.fc_attr(roi_features)
-            roi_features = F.relu(roi_features)
+            roi_features = nn.functional.relu(roi_features)
            attr_scores = self.attr_score(roi_features)
            return scores, attr_scores, proposal_deltas
        else:
--- a/examples/research_projects/lxmert/processing_image.py
+++ b/examples/research_projects/lxmert/processing_image.py
@@ -20,8 +20,8 @@ from typing import Tuple
 import numpy as np
 import torch
 import torch.nn.functional as F
 from PIL import Image
 from torch import nn
 from utils import img_tensorize
@@ -63,7 +63,9 @@ class ResizeShortestEdge:
                img = np.asarray(pil_image)
            else:
                img = img.permute(2, 0, 1).unsqueeze(0)  # 3, 0, 1)  # hw(c) -> nchw
-                img = F.interpolate(img, (newh, neww), mode=self.interp_method, align_corners=False).squeeze(0)
+                img = nn.functional.interpolate(
                    img, (newh, neww), mode=self.interp_method, align_corners=False
                ).squeeze(0)
            img_augs.append(img)
        return img_augs
@@ -85,7 +87,7 @@ class Preprocess:
        max_size = tuple(max(s) for s in zip(*[img.shape for img in images]))
        image_sizes = [im.shape[-2:] for im in images]
        images = [
-            F.pad(
+            nn.functional.pad(
                im,
                [0, max_size[-1] - size[1], 0, max_size[-2] - size[0]],
                value=self.pad_value,
--- a/examples/research_projects/mm-imdb/run_mmimdb.py
+++ b/examples/research_projects/mm-imdb/run_mmimdb.py
@@ -25,8 +25,8 @@ import random
 import numpy as np
 import torch
 import torch.nn as nn
 from sklearn.metrics import f1_score
 from torch import nn
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
@@ -107,11 +107,11 @@ def train(args, train_dataset, model, tokenizer, criterion):
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )
@@ -166,9 +166,9 @@ def train(args, train_dataset, model, tokenizer, criterion):
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
@@ -248,8 +248,8 @@ def evaluate(args, model, tokenizer, criterion, prefix=""):
    )
    # multi-gpu eval
-    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
+    if args.n_gpu > 1 and not isinstance(model, nn.DataParallel):
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
--- a/examples/research_projects/mm-imdb/utils_mmimdb.py
+++ b/examples/research_projects/mm-imdb/utils_mmimdb.py
@@ -19,10 +19,10 @@ import os
 from collections import Counter
 import torch
 import torch.nn as nn
 import torchvision
 import torchvision.transforms as transforms
 from PIL import Image
 from torch import nn
 from torch.utils.data import Dataset
--- a/examples/research_projects/movement-pruning/Saving_PruneBERT.ipynb
+++ b/examples/research_projects/movement-pruning/Saving_PruneBERT.ipynb
@@ -75,7 +75,7 @@
    "quantized_model = torch.quantization.quantize_dynamic(\n",
    "                    model=model,\n",
    "                    qconfig_spec = {\n",
-    "                        torch.nn.Linear : torch.quantization.default_dynamic_qconfig,\n",
+    "                        nn.Linear : torch.quantization.default_dynamic_qconfig,\n",
    "                    },\n",
    "                    dtype=torch.qint8,\n",
    "                )\n",
--- a/examples/research_projects/movement-pruning/emmental/modules/masked_nn.py
+++ b/examples/research_projects/movement-pruning/emmental/modules/masked_nn.py
@@ -23,7 +23,6 @@ import math
 import torch
 from torch import nn
 from torch.nn import functional as F
 from torch.nn import init
 from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer
@@ -104,4 +103,4 @@ class MaskedLinear(nn.Linear):
        # Mask weights with computed mask
        weight_thresholded = mask * self.weight
        # Compute output (linear layer) with masked weights
-        return F.linear(input, weight_thresholded, self.bias)
+        return nn.functional.linear(input, weight_thresholded, self.bias)
--- a/examples/research_projects/movement-pruning/masked_run_glue.py
+++ b/examples/research_projects/movement-pruning/masked_run_glue.py
@@ -24,8 +24,7 @@ import random
 import numpy as np
 import torch
-import torch.nn as nn
+from torch import nn
 import torch.nn.functional as F
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
@@ -168,11 +167,11 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
@@ -287,9 +286,9 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                    )
                loss_logits = (
-                    F.kl_div(
+                    nn.functional.kl_div(
-                        input=F.log_softmax(logits_stu / args.temperature, dim=-1),
+                        input=nn.functional.log_softmax(logits_stu / args.temperature, dim=-1),
-                        target=F.softmax(logits_tea / args.temperature, dim=-1),
+                        target=nn.functional.softmax(logits_tea / args.temperature, dim=-1),
                        reduction="batchmean",
                    )
                    * (args.temperature ** 2)
@@ -320,9 +319,9 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                and (step + 1) == len(epoch_iterator)
            ):
                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    tb_writer.add_scalar("threshold", threshold, global_step)
@@ -436,8 +435,8 @@ def evaluate(args, model, tokenizer, prefix=""):
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
        # multi-gpu eval
-        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
+        if args.n_gpu > 1 and not isinstance(model, nn.DataParallel):
-            model = torch.nn.DataParallel(model)
+            model = nn.DataParallel(model)
        # Eval!
        logger.info("***** Running evaluation {} *****".format(prefix))
--- a/examples/research_projects/movement-pruning/masked_run_squad.py
+++ b/examples/research_projects/movement-pruning/masked_run_squad.py
@@ -25,8 +25,7 @@ import timeit
 import numpy as np
 import torch
-import torch.nn as nn
+from torch import nn
 import torch.nn.functional as F
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
@@ -176,11 +175,11 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
@@ -308,17 +307,17 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                    )
                loss_start = (
-                    F.kl_div(
+                    nn.functional.kl_div(
-                        input=F.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                        input=nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                        target=F.softmax(start_logits_tea / args.temperature, dim=-1),
+                        target=nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
                        reduction="batchmean",
                    )
                    * (args.temperature ** 2)
                )
                loss_end = (
-                    F.kl_div(
+                    nn.functional.kl_div(
-                        input=F.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                        input=nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                        target=F.softmax(end_logits_tea / args.temperature, dim=-1),
+                        target=nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
                        reduction="batchmean",
                    )
                    * (args.temperature ** 2)
@@ -346,9 +345,9 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    tb_writer.add_scalar("threshold", threshold, global_step)
@@ -454,8 +453,8 @@ def evaluate(args, model, tokenizer, prefix=""):
    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
    # multi-gpu eval
-    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
+    if args.n_gpu > 1 and not isinstance(model, nn.DataParallel):
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
--- a/examples/research_projects/pplm/pplm_classification_head.py
+++ b/examples/research_projects/pplm/pplm_classification_head.py
@@ -1,19 +1,19 @@
-import torch
+from torch import nn
-class ClassificationHead(torch.nn.Module):
+class ClassificationHead(nn.Module):
    """Classification Head for  transformer encoders"""
    def __init__(self, class_size, embed_size):
        super().__init__()
        self.class_size = class_size
        self.embed_size = embed_size
-        # self.mlp1 = torch.nn.Linear(embed_size, embed_size)
+        # self.mlp1 = nn.Linear(embed_size, embed_size)
-        # self.mlp2 = (torch.nn.Linear(embed_size, class_size))
+        # self.mlp2 = (nn.Linear(embed_size, class_size))
-        self.mlp = torch.nn.Linear(embed_size, class_size)
+        self.mlp = nn.Linear(embed_size, class_size)
    def forward(self, hidden_state):
-        # hidden_state = F.relu(self.mlp1(hidden_state))
+        # hidden_state = nn.functional.relu(self.mlp1(hidden_state))
        # hidden_state = self.mlp2(hidden_state)
        logits = self.mlp(hidden_state)
        return logits
--- a/examples/research_projects/pplm/run_pplm.py
+++ b/examples/research_projects/pplm/run_pplm.py
@@ -30,7 +30,7 @@ from typing import List, Optional, Tuple, Union
 import numpy as np
 import torch
-import torch.nn.functional as F
+from torch import nn
 from tqdm import trange
 from pplm_classification_head import ClassificationHead
@@ -160,7 +160,7 @@ def perturb_past(
        new_accumulated_hidden = accumulated_hidden + torch.sum(hidden, dim=1).detach()
        # TODO: Check the layer-norm consistency of this with trained discriminator (Sumanth)
        logits = all_logits[:, -1, :]
-        probs = F.softmax(logits, dim=-1)
+        probs = nn.functional.softmax(logits, dim=-1)
        loss = 0.0
        loss_list = []
@@ -173,7 +173,7 @@ def perturb_past(
            print(" pplm_bow_loss:", loss.data.cpu().numpy())
        if loss_type == 2 or loss_type == 3:
-            ce_loss = torch.nn.CrossEntropyLoss()
+            ce_loss = nn.CrossEntropyLoss()
            # TODO why we need to do this assignment and not just using unpert_past? (Sumanth)
            curr_unpert_past = unpert_past
            curr_probs = torch.unsqueeze(probs, dim=1)
@@ -195,7 +195,7 @@ def perturb_past(
        kl_loss = 0.0
        if kl_scale > 0.0:
-            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
+            unpert_probs = nn.functional.softmax(unpert_logits[:, -1, :], dim=-1)
            unpert_probs = unpert_probs + SMALL_CONST * (unpert_probs <= SMALL_CONST).float().to(device).detach()
            correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(device).detach()
            corrected_probs = probs + correction.detach()
@@ -527,10 +527,10 @@ def generate_text_pplm(
            else:
                pert_logits[0, token_idx] /= repetition_penalty
-        pert_probs = F.softmax(pert_logits, dim=-1)
+        pert_probs = nn.functional.softmax(pert_logits, dim=-1)
        if classifier is not None:
-            ce_loss = torch.nn.CrossEntropyLoss()
+            ce_loss = nn.CrossEntropyLoss()
            prediction = classifier(torch.mean(unpert_last_hidden, dim=1))
            label = torch.tensor([class_label], device=device, dtype=torch.long)
            unpert_discrim_loss = ce_loss(prediction, label)
@@ -541,7 +541,7 @@ def generate_text_pplm(
        # Fuse the modified model and original model
        if perturb:
-            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
+            unpert_probs = nn.functional.softmax(unpert_logits[:, -1, :], dim=-1)
            pert_probs = (pert_probs ** gm_scale) * (unpert_probs ** (1 - gm_scale))  # + SMALL_CONST
            pert_probs = top_k_filter(pert_probs, k=top_k, probs=True)  # + SMALL_CONST
@@ -552,7 +552,7 @@ def generate_text_pplm(
        else:
            pert_logits = top_k_filter(pert_logits, k=top_k)  # + SMALL_CONST
-            pert_probs = F.softmax(pert_logits, dim=-1)
+            pert_probs = nn.functional.softmax(pert_logits, dim=-1)
        # sample or greedy
        if sample:
--- a/examples/research_projects/pplm/run_pplm_discrim_train.py
+++ b/examples/research_projects/pplm/run_pplm_discrim_train.py
@@ -23,10 +23,10 @@ import time
 import numpy as np
 import torch
 import torch.nn.functional as F
 import torch.optim as optim
 import torch.utils.data as data
 from nltk.tokenize.treebank import TreebankWordDetokenizer
 from torch import nn
 from torchtext import data as torchtext_data
 from torchtext import datasets
 from tqdm import tqdm, trange
@@ -42,7 +42,7 @@ example_sentence = "This is incredible! I love it, this is the best chicken I ha
 max_length_seq = 100
-class Discriminator(torch.nn.Module):
+class Discriminator(nn.Module):
    """Transformer encoder followed by a Classification Head"""
    def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
@@ -76,7 +76,7 @@ class Discriminator(torch.nn.Module):
            avg_hidden = self.avg_representation(x.to(self.device))
        logits = self.classifier_head(avg_hidden)
-        probs = F.log_softmax(logits, dim=-1)
+        probs = nn.functional.log_softmax(logits, dim=-1)
        return probs
@@ -140,7 +140,7 @@ def train_epoch(data_loader, discriminator, optimizer, epoch=0, log_interval=10,
        optimizer.zero_grad()
        output_t = discriminator(input_t)
-        loss = F.nll_loss(output_t, target_t)
+        loss = nn.functional.nll_loss(output_t, target_t)
        loss.backward(retain_graph=True)
        optimizer.step()
@@ -167,7 +167,7 @@ def evaluate_performance(data_loader, discriminator, device="cpu"):
            input_t, target_t = input_t.to(device), target_t.to(device)
            output_t = discriminator(input_t)
            # sum up batch loss
-            test_loss += F.nll_loss(output_t, target_t, reduction="sum").item()
+            test_loss += nn.functional.nll_loss(output_t, target_t, reduction="sum").item()
            # get the index of the max log-probability
            pred_t = output_t.argmax(dim=1, keepdim=True)
            correct += pred_t.eq(target_t.view_as(pred_t)).sum().item()
--- a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py
+++ b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py
@@ -8,6 +8,7 @@ from pathlib import Path
 import pytest
 import pytorch_lightning as pl
 import torch
 from torch import nn
 import lightning_base
 from convert_pl_checkpoint_to_hf import convert_pl_to_hf
@@ -183,7 +184,7 @@ class TestSummarizationDistiller(TestCasePlus):
        logits = model(input_ids, attention_mask=mask, decoder_input_ids=decoder_input_ids, use_cache=False).logits
-        lprobs = torch.nn.functional.log_softmax(logits, dim=-1)
+        lprobs = nn.functional.log_softmax(logits, dim=-1)
        smoothed_loss, nll_loss = label_smoothed_nll_loss(
            lprobs, lm_labels, 0.1, ignore_index=model.config.pad_token_id
        )
--- a/examples/research_projects/seq2seq-distillation/distillation.py
+++ b/examples/research_projects/seq2seq-distillation/distillation.py
@@ -10,7 +10,6 @@ from typing import List
 import pytorch_lightning as pl
 import torch
 from torch import nn
 from torch.nn import functional as F
 from finetune import SummarizationModule, TranslationModule
 from finetune import main as ft_main
@@ -123,8 +122,8 @@ class SummarizationDistiller(SummarizationModule):
        assert t_logits_slct.size() == s_logits_slct.size()
        loss_ce = (
            self.ce_loss_fct(
-                F.log_softmax(s_logits_slct / self.temperature, dim=-1),
+                nn.functional.log_softmax(s_logits_slct / self.temperature, dim=-1),
-                F.softmax(t_logits_slct / self.temperature, dim=-1),
+                nn.functional.softmax(t_logits_slct / self.temperature, dim=-1),
            )
            * (self.temperature) ** 2
        )
@@ -160,10 +159,10 @@ class SummarizationDistiller(SummarizationModule):
        assert lm_logits.shape[-1] == self.model.config.vocab_size
        if self.hparams.label_smoothing == 0:
            # Same behavior as modeling_bart.py, besides ignoring pad_token_id
-            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
+            loss_fct = nn.CrossEntropyLoss(ignore_index=pad_token_id)
            student_lm_loss = loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), labels.view(-1))
        else:
-            lprobs = F.log_softmax(lm_logits, dim=-1)
+            lprobs = nn.functional.log_softmax(lm_logits, dim=-1)
            student_lm_loss, _ = label_smoothed_nll_loss(
                lprobs, labels, self.hparams.label_smoothing, ignore_index=pad_token_id
            )
@@ -230,9 +229,9 @@ class SummarizationDistiller(SummarizationModule):
        teacher_states = torch.stack([hidden_states_T[j] for j in matches])
        assert student_states.shape == teacher_states.shape, f"{student_states.shape} != {teacher_states.shape}"
        if normalize_hidden:
-            student_states = F.layer_norm(student_states, student_states.shape[1:])
+            student_states = nn.functional.layer_norm(student_states, student_states.shape[1:])
-            teacher_states = F.layer_norm(teacher_states, teacher_states.shape[1:])
+            teacher_states = nn.functional.layer_norm(teacher_states, teacher_states.shape[1:])
-        mse = F.mse_loss(student_states, teacher_states, reduction="none")
+        mse = nn.functional.mse_loss(student_states, teacher_states, reduction="none")
        masked_mse = (mse * mask.unsqueeze(0).unsqueeze(-1)).sum() / valid_count
        return masked_mse
--- a/examples/research_projects/seq2seq-distillation/finetune.py
+++ b/examples/research_projects/seq2seq-distillation/finetune.py
@@ -13,6 +13,7 @@ from typing import Dict, List, Tuple
 import numpy as np
 import pytorch_lightning as pl
 import torch
 from torch import nn
 from torch.utils.data import DataLoader
 from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback
@@ -151,12 +152,12 @@ class SummarizationModule(BaseTransformer):
        lm_logits = outputs["logits"]
        if self.hparams.label_smoothing == 0:
            # Same behavior as modeling_bart.py, besides ignoring pad_token_id
-            ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
+            ce_loss_fct = nn.CrossEntropyLoss(ignore_index=pad_token_id)
            assert lm_logits.shape[-1] == self.vocab_size
            loss = ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))
        else:
-            lprobs = torch.nn.functional.log_softmax(lm_logits, dim=-1)
+            lprobs = nn.functional.log_softmax(lm_logits, dim=-1)
            loss, nll_loss = label_smoothed_nll_loss(
                lprobs, tgt_ids, self.hparams.label_smoothing, ignore_index=pad_token_id
            )
--- a/examples/research_projects/wav2vec2/run_asr.py
+++ b/examples/research_projects/wav2vec2/run_asr.py
@@ -9,8 +9,8 @@ from typing import Any, Callable, Dict, List, Optional, Set, Union
 import datasets
 import numpy as np
 import torch
 import torch.nn as nn
 from packaging import version
 from torch import nn
 import librosa
 from lang_trans import arabic
--- a/examples/research_projects/wav2vec2/run_pretrain.py
+++ b/examples/research_projects/wav2vec2/run_pretrain.py
@@ -5,9 +5,9 @@ from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Union
 import torch
 import torch.nn as nn
 from datasets import DatasetDict, load_dataset
 from packaging import version
 from torch import nn
 import librosa
 from transformers import (