From 88e84186e5a0d5dd78b62b1a8e97b2c269426442 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Mon, 14 Jun 2021 12:28:24 -0700
Subject: [PATCH] [style] consistent nn. and nn.functional: part 4 `examples`
 (#12156)

* consistent nn. and nn.functional: p4 examples

* restore
---
 .../pabee/modeling_pabee_albert.py            |  3 +-
 .../pabee/modeling_pabee_bert.py              |  1 +
 .../run_glue_with_pabee.py                    | 13 ++++----
 .../bertology/run_bertology.py                |  5 +--
 .../bertology/run_prune_gpt.py                |  5 +--
 .../deebert/run_glue_deebert.py               | 11 ++++---
 .../deebert/src/modeling_highway_roberta.py   |  2 +-
 .../distillation/distiller.py                 | 11 +++----
 .../distillation/run_squad_w_distillation.py  | 23 +++++++-------
 .../longform-qa/eli5_utils.py                 |  9 +++---
 .../lxmert/modeling_frcnn.py                  | 31 +++++++++----------
 .../lxmert/processing_image.py                |  8 +++--
 .../research_projects/mm-imdb/run_mmimdb.py   | 14 ++++-----
 .../research_projects/mm-imdb/utils_mmimdb.py |  2 +-
 .../movement-pruning/Saving_PruneBERT.ipynb   |  2 +-
 .../emmental/modules/masked_nn.py             |  3 +-
 .../movement-pruning/masked_run_glue.py       | 21 ++++++-------
 .../movement-pruning/masked_run_squad.py      | 27 ++++++++--------
 .../pplm/pplm_classification_head.py          | 12 +++----
 examples/research_projects/pplm/run_pplm.py   | 16 +++++-----
 .../pplm/run_pplm_discrim_train.py            | 10 +++---
 .../_test_seq2seq_examples.py                 |  3 +-
 .../seq2seq-distillation/distillation.py      | 15 +++++----
 .../seq2seq-distillation/finetune.py          |  5 +--
 .../research_projects/wav2vec2/run_asr.py     |  2 +-
 .../wav2vec2/run_pretrain.py                  |  2 +-
 26 files changed, 130 insertions(+), 126 deletions(-)

diff --git a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py
index 960dd4d830..006ff98c95 100644
--- a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py
+++ b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_albert.py
@@ -17,7 +17,7 @@
 import logging
 
 import torch
-import torch.nn as nn
+from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
@@ -270,6 +270,7 @@ class AlbertForSequenceClassificationWithPabee(AlbertPreTrainedModel):
 
                 from transformers import AlbertTokenizer
                 from pabee import AlbertForSequenceClassificationWithPabee
+                from torch import nn
                 import torch
 
                 tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
diff --git a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py
index 89de6168ec..7384d78fb9 100644
--- a/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py
+++ b/examples/research_projects/bert-loses-patience/pabee/modeling_pabee_bert.py
@@ -294,6 +294,7 @@ class BertForSequenceClassificationWithPabee(BertPreTrainedModel):
 
             from transformers import BertTokenizer, BertForSequenceClassification
             from pabee import BertForSequenceClassificationWithPabee
+            from torch import nn
             import torch
 
             tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
diff --git a/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py b/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
index 0366366d71..c5d0633fda 100755
--- a/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
+++ b/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
@@ -25,6 +25,7 @@ import random
 
 import numpy as np
 import torch
+from torch import nn
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
@@ -117,11 +118,11 @@ def train(args, train_dataset, model, tokenizer):
 
     # multi-gpu training (should be after apex fp16 initialization)
     if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
             model,
             device_ids=[args.local_rank],
             output_device=args.local_rank,
@@ -203,9 +204,9 @@ def train(args, train_dataset, model, tokenizer):
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
                 if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                 else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
                 optimizer.step()
                 scheduler.step()  # Update learning rate schedule
@@ -291,8 +292,8 @@ def evaluate(args, model, tokenizer, prefix="", patience=0):
         eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
         # multi-gpu eval
-        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
-            model = torch.nn.DataParallel(model)
+        if args.n_gpu > 1 and not isinstance(model, nn.DataParallel):
+            model = nn.DataParallel(model)
 
         # Eval!
         logger.info("***** Running evaluation {} *****".format(prefix))
diff --git a/examples/research_projects/bertology/run_bertology.py b/examples/research_projects/bertology/run_bertology.py
index fb1c24e5bc..1018359dc6 100644
--- a/examples/research_projects/bertology/run_bertology.py
+++ b/examples/research_projects/bertology/run_bertology.py
@@ -26,6 +26,7 @@ from datetime import datetime
 
 import numpy as np
 import torch
+from torch import nn
 from torch.utils.data import DataLoader, SequentialSampler, Subset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm
@@ -415,11 +416,11 @@ def main():
     # Distributed and parallel training
     model.to(args.device)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
             model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
         )
     elif args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
 
     # Print/save training arguments
     os.makedirs(args.output_dir, exist_ok=True)
diff --git a/examples/research_projects/bertology/run_prune_gpt.py b/examples/research_projects/bertology/run_prune_gpt.py
index 5dbabe3912..49a867b96d 100644
--- a/examples/research_projects/bertology/run_prune_gpt.py
+++ b/examples/research_projects/bertology/run_prune_gpt.py
@@ -10,6 +10,7 @@ from datetime import datetime
 
 import numpy as np
 import torch
+from torch import nn
 from torch.utils.data import DataLoader, RandomSampler, TensorDataset
 from tqdm import tqdm
 
@@ -352,11 +353,11 @@ def main():
     # Distributed and parallel training
     model.to(args.device)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
             model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
         )
     elif args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
 
     # Print/save training arguments
     os.makedirs(args.output_dir, exist_ok=True)
diff --git a/examples/research_projects/deebert/run_glue_deebert.py b/examples/research_projects/deebert/run_glue_deebert.py
index 97ae17faab..fce491e790 100644
--- a/examples/research_projects/deebert/run_glue_deebert.py
+++ b/examples/research_projects/deebert/run_glue_deebert.py
@@ -9,6 +9,7 @@ import time
 
 import numpy as np
 import torch
+from torch import nn
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
@@ -135,11 +136,11 @@ def train(args, train_dataset, model, tokenizer, train_highway=False):
 
     # multi-gpu training (should be after apex fp16 initialization)
     if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
             model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
         )
 
@@ -190,9 +191,9 @@ def train(args, train_dataset, model, tokenizer, train_highway=False):
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
                 if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                 else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
                 optimizer.step()
                 scheduler.step()  # Update learning rate schedule
@@ -255,7 +256,7 @@ def evaluate(args, model, tokenizer, prefix="", output_layer=-1, eval_highway=Fa
 
         # multi-gpu eval
         if args.n_gpu > 1:
-            model = torch.nn.DataParallel(model)
+            model = nn.DataParallel(model)
 
         # Eval!
         logger.info("***** Running evaluation {} *****".format(prefix))
diff --git a/examples/research_projects/deebert/src/modeling_highway_roberta.py b/examples/research_projects/deebert/src/modeling_highway_roberta.py
index 7534026595..c8358ac994 100644
--- a/examples/research_projects/deebert/src/modeling_highway_roberta.py
+++ b/examples/research_projects/deebert/src/modeling_highway_roberta.py
@@ -1,6 +1,6 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-import torch.nn as nn
+from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from transformers import RobertaConfig
diff --git a/examples/research_projects/distillation/distiller.py b/examples/research_projects/distillation/distiller.py
index 95e6ac0bbc..a9716506c1 100644
--- a/examples/research_projects/distillation/distiller.py
+++ b/examples/research_projects/distillation/distiller.py
@@ -21,8 +21,7 @@ import time
 
 import psutil
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
+from torch import nn
 from torch.optim import AdamW
 from torch.utils.data import BatchSampler, DataLoader, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
@@ -412,8 +411,8 @@ class Distiller:
 
         loss_ce = (
             self.ce_loss_fct(
-                F.log_softmax(s_logits_slct / self.temperature, dim=-1),
-                F.softmax(t_logits_slct / self.temperature, dim=-1),
+                nn.functional.log_softmax(s_logits_slct / self.temperature, dim=-1),
+                nn.functional.softmax(t_logits_slct / self.temperature, dim=-1),
             )
             * (self.temperature) ** 2
         )
@@ -492,9 +491,9 @@ class Distiller:
         self.iter()
         if self.n_iter % self.params.gradient_accumulation_steps == 0:
             if self.fp16:
-                torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
+                nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), self.params.max_grad_norm)
             else:
-                torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
+                nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm)
             self.optimizer.step()
             self.optimizer.zero_grad()
             self.scheduler.step()
diff --git a/examples/research_projects/distillation/run_squad_w_distillation.py b/examples/research_projects/distillation/run_squad_w_distillation.py
index 1c7256fccf..3d2320490f 100644
--- a/examples/research_projects/distillation/run_squad_w_distillation.py
+++ b/examples/research_projects/distillation/run_squad_w_distillation.py
@@ -24,8 +24,7 @@ import timeit
 
 import numpy as np
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
+from torch import nn
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
@@ -138,11 +137,11 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
 
     # multi-gpu training (should be after apex fp16 initialization)
     if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
             model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
         )
 
@@ -232,15 +231,15 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                 loss_fct = nn.KLDivLoss(reduction="batchmean")
                 loss_start = (
                     loss_fct(
-                        F.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                        F.softmax(start_logits_tea / args.temperature, dim=-1),
+                        nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                        nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
                     )
                     * (args.temperature ** 2)
                 )
                 loss_end = (
                     loss_fct(
-                        F.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                        F.softmax(end_logits_tea / args.temperature, dim=-1),
+                        nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                        nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
                     )
                     * (args.temperature ** 2)
                 )
@@ -262,9 +261,9 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
                 if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                 else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
                 optimizer.step()
                 scheduler.step()  # Update learning rate schedule
@@ -326,8 +325,8 @@ def evaluate(args, model, tokenizer, prefix=""):
     eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
     # multi-gpu evaluate
-    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
-        model = torch.nn.DataParallel(model)
+    if args.n_gpu > 1 and not isinstance(model, nn.DataParallel):
+        model = nn.DataParallel(model)
 
     # Eval!
     logger.info("***** Running evaluation {} *****".format(prefix))
diff --git a/examples/research_projects/longform-qa/eli5_utils.py b/examples/research_projects/longform-qa/eli5_utils.py
index 60bc424a7f..ff72a16bfd 100644
--- a/examples/research_projects/longform-qa/eli5_utils.py
+++ b/examples/research_projects/longform-qa/eli5_utils.py
@@ -11,6 +11,7 @@ import torch
 import torch.utils.checkpoint as checkpoint
 from elasticsearch import Elasticsearch  # noqa: F401
 from elasticsearch.helpers import bulk, streaming_bulk  # noqa: F401
+from torch import nn
 from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
 from tqdm import tqdm
 
@@ -116,14 +117,14 @@ class ELI5DatasetQARetriver(Dataset):
         return self.make_example(idx % self.data.num_rows)
 
 
-class RetrievalQAEmbedder(torch.nn.Module):
+class RetrievalQAEmbedder(nn.Module):
     def __init__(self, sent_encoder, dim):
         super(RetrievalQAEmbedder, self).__init__()
         self.sent_encoder = sent_encoder
         self.output_dim = 128
-        self.project_q = torch.nn.Linear(dim, self.output_dim, bias=False)
-        self.project_a = torch.nn.Linear(dim, self.output_dim, bias=False)
-        self.ce_loss = torch.nn.CrossEntropyLoss(reduction="mean")
+        self.project_q = nn.Linear(dim, self.output_dim, bias=False)
+        self.project_a = nn.Linear(dim, self.output_dim, bias=False)
+        self.ce_loss = nn.CrossEntropyLoss(reduction="mean")
 
     def embed_sentences_checkpointed(self, input_ids, attention_mask, checkpoint_batch_size=-1):
         # reproduces BERT forward pass with checkpointing
diff --git a/examples/research_projects/lxmert/modeling_frcnn.py b/examples/research_projects/lxmert/modeling_frcnn.py
index 9489f4c11d..89f01f4fca 100644
--- a/examples/research_projects/lxmert/modeling_frcnn.py
+++ b/examples/research_projects/lxmert/modeling_frcnn.py
@@ -25,7 +25,6 @@ from typing import Dict, List, Tuple
 import numpy as np
 import torch
 from torch import nn
-from torch.nn import functional as F
 from torch.nn.modules.batchnorm import BatchNorm2d
 from torchvision.ops import RoIPool
 from torchvision.ops.boxes import batched_nms, nms
@@ -85,7 +84,7 @@ def pad_list_tensors(
             too_small = True
             tensor_i = tensor_i.unsqueeze(-1)
         assert isinstance(tensor_i, torch.Tensor)
-        tensor_i = F.pad(
+        tensor_i = nn.functional.pad(
             input=tensor_i,
             pad=(0, 0, 0, max_detections - preds_per_image[i]),
             mode="constant",
@@ -701,7 +700,7 @@ class RPNOutputs(object):
 
 
 # Main Classes
-class Conv2d(torch.nn.Conv2d):
+class Conv2d(nn.Conv2d):
     def __init__(self, *args, **kwargs):
         norm = kwargs.pop("norm", None)
         activation = kwargs.pop("activation", None)
@@ -712,9 +711,9 @@ class Conv2d(torch.nn.Conv2d):
 
     def forward(self, x):
         if x.numel() == 0 and self.training:
-            assert not isinstance(self.norm, torch.nn.SyncBatchNorm)
+            assert not isinstance(self.norm, nn.SyncBatchNorm)
         if x.numel() == 0:
-            assert not isinstance(self.norm, torch.nn.GroupNorm)
+            assert not isinstance(self.norm, nn.GroupNorm)
             output_shape = [
                 (i + 2 * p - (di * (k - 1) + 1)) // s + 1
                 for i, p, di, k, s in zip(
@@ -752,7 +751,7 @@ class LastLevelMaxPool(nn.Module):
         self.in_feature = "p5"
 
     def forward(self, x):
-        return [F.max_pool2d(x, kernel_size=1, stride=2, padding=0)]
+        return [nn.functional.max_pool2d(x, kernel_size=1, stride=2, padding=0)]
 
 
 class LastLevelP6P7(nn.Module):
@@ -769,7 +768,7 @@ class LastLevelP6P7(nn.Module):
 
     def forward(self, c5):
         p6 = self.p6(c5)
-        p7 = self.p7(F.relu(p6))
+        p7 = self.p7(nn.functional.relu(p6))
         return [p6, p7]
 
 
@@ -790,11 +789,11 @@ class BasicStem(nn.Module):
 
     def forward(self, x):
         x = self.conv1(x)
-        x = F.relu_(x)
+        x = nn.functional.relu_(x)
         if self.caffe_maxpool:
-            x = F.max_pool2d(x, kernel_size=3, stride=2, padding=0, ceil_mode=True)
+            x = nn.functional.max_pool2d(x, kernel_size=3, stride=2, padding=0, ceil_mode=True)
         else:
-            x = F.max_pool2d(x, kernel_size=3, stride=2, padding=1)
+            x = nn.functional.max_pool2d(x, kernel_size=3, stride=2, padding=1)
         return x
 
     @property
@@ -881,10 +880,10 @@ class BottleneckBlock(ResNetBlockBase):
 
     def forward(self, x):
         out = self.conv1(x)
-        out = F.relu_(out)
+        out = nn.functional.relu_(out)
 
         out = self.conv2(out)
-        out = F.relu_(out)
+        out = nn.functional.relu_(out)
 
         out = self.conv3(out)
 
@@ -894,7 +893,7 @@ class BottleneckBlock(ResNetBlockBase):
             shortcut = x
 
         out += shortcut
-        out = F.relu_(out)
+        out = nn.functional.relu_(out)
         return out
 
 
@@ -1159,7 +1158,7 @@ class ROIOutputs(object):
         return boxes.view(num_pred, K * B).split(preds_per_image, dim=0)
 
     def _predict_objs(self, obj_logits, preds_per_image):
-        probs = F.softmax(obj_logits, dim=-1)
+        probs = nn.functional.softmax(obj_logits, dim=-1)
         probs = probs.split(preds_per_image, dim=0)
         return probs
 
@@ -1490,7 +1489,7 @@ class RPNHead(nn.Module):
         pred_objectness_logits = []
         pred_anchor_deltas = []
         for x in features:
-            t = F.relu(self.conv(x))
+            t = nn.functional.relu(self.conv(x))
             pred_objectness_logits.append(self.objectness_logits(t))
             pred_anchor_deltas.append(self.anchor_deltas(t))
         return pred_objectness_logits, pred_anchor_deltas
@@ -1650,7 +1649,7 @@ class FastRCNNOutputLayers(nn.Module):
             cls_emb = self.cls_embedding(max_class)  # [b] --> [b, 256]
             roi_features = torch.cat([roi_features, cls_emb], -1)  # [b, 2048] + [b, 256] --> [b, 2304]
             roi_features = self.fc_attr(roi_features)
-            roi_features = F.relu(roi_features)
+            roi_features = nn.functional.relu(roi_features)
             attr_scores = self.attr_score(roi_features)
             return scores, attr_scores, proposal_deltas
         else:
diff --git a/examples/research_projects/lxmert/processing_image.py b/examples/research_projects/lxmert/processing_image.py
index ff449985b0..7ea5dace02 100644
--- a/examples/research_projects/lxmert/processing_image.py
+++ b/examples/research_projects/lxmert/processing_image.py
@@ -20,8 +20,8 @@ from typing import Tuple
 
 import numpy as np
 import torch
-import torch.nn.functional as F
 from PIL import Image
+from torch import nn
 
 from utils import img_tensorize
 
@@ -63,7 +63,9 @@ class ResizeShortestEdge:
                 img = np.asarray(pil_image)
             else:
                 img = img.permute(2, 0, 1).unsqueeze(0)  # 3, 0, 1)  # hw(c) -> nchw
-                img = F.interpolate(img, (newh, neww), mode=self.interp_method, align_corners=False).squeeze(0)
+                img = nn.functional.interpolate(
+                    img, (newh, neww), mode=self.interp_method, align_corners=False
+                ).squeeze(0)
             img_augs.append(img)
 
         return img_augs
@@ -85,7 +87,7 @@ class Preprocess:
         max_size = tuple(max(s) for s in zip(*[img.shape for img in images]))
         image_sizes = [im.shape[-2:] for im in images]
         images = [
-            F.pad(
+            nn.functional.pad(
                 im,
                 [0, max_size[-1] - size[1], 0, max_size[-2] - size[0]],
                 value=self.pad_value,
diff --git a/examples/research_projects/mm-imdb/run_mmimdb.py b/examples/research_projects/mm-imdb/run_mmimdb.py
index 4157d2e9cf..7f6f25dd6b 100644
--- a/examples/research_projects/mm-imdb/run_mmimdb.py
+++ b/examples/research_projects/mm-imdb/run_mmimdb.py
@@ -25,8 +25,8 @@ import random
 
 import numpy as np
 import torch
-import torch.nn as nn
 from sklearn.metrics import f1_score
+from torch import nn
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
@@ -107,11 +107,11 @@ def train(args, train_dataset, model, tokenizer, criterion):
 
     # multi-gpu training (should be after apex fp16 initialization)
     if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
             model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
         )
 
@@ -166,9 +166,9 @@ def train(args, train_dataset, model, tokenizer, criterion):
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
                 if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                 else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
                 optimizer.step()
                 scheduler.step()  # Update learning rate schedule
@@ -248,8 +248,8 @@ def evaluate(args, model, tokenizer, criterion, prefix=""):
     )
 
     # multi-gpu eval
-    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
-        model = torch.nn.DataParallel(model)
+    if args.n_gpu > 1 and not isinstance(model, nn.DataParallel):
+        model = nn.DataParallel(model)
 
     # Eval!
     logger.info("***** Running evaluation {} *****".format(prefix))
diff --git a/examples/research_projects/mm-imdb/utils_mmimdb.py b/examples/research_projects/mm-imdb/utils_mmimdb.py
index cabc85edbb..df8e38d597 100644
--- a/examples/research_projects/mm-imdb/utils_mmimdb.py
+++ b/examples/research_projects/mm-imdb/utils_mmimdb.py
@@ -19,10 +19,10 @@ import os
 from collections import Counter
 
 import torch
-import torch.nn as nn
 import torchvision
 import torchvision.transforms as transforms
 from PIL import Image
+from torch import nn
 from torch.utils.data import Dataset
 
 
diff --git a/examples/research_projects/movement-pruning/Saving_PruneBERT.ipynb b/examples/research_projects/movement-pruning/Saving_PruneBERT.ipynb
index b9ce4bb892..6faeea1a00 100644
--- a/examples/research_projects/movement-pruning/Saving_PruneBERT.ipynb
+++ b/examples/research_projects/movement-pruning/Saving_PruneBERT.ipynb
@@ -75,7 +75,7 @@
     "quantized_model = torch.quantization.quantize_dynamic(\n",
     "                    model=model,\n",
     "                    qconfig_spec = {\n",
-    "                        torch.nn.Linear : torch.quantization.default_dynamic_qconfig,\n",
+    "                        nn.Linear : torch.quantization.default_dynamic_qconfig,\n",
     "                    },\n",
     "                    dtype=torch.qint8,\n",
     "                )\n",
diff --git a/examples/research_projects/movement-pruning/emmental/modules/masked_nn.py b/examples/research_projects/movement-pruning/emmental/modules/masked_nn.py
index 72fa629aff..e3c9483685 100644
--- a/examples/research_projects/movement-pruning/emmental/modules/masked_nn.py
+++ b/examples/research_projects/movement-pruning/emmental/modules/masked_nn.py
@@ -23,7 +23,6 @@ import math
 
 import torch
 from torch import nn
-from torch.nn import functional as F
 from torch.nn import init
 
 from .binarizer import MagnitudeBinarizer, ThresholdBinarizer, TopKBinarizer
@@ -104,4 +103,4 @@ class MaskedLinear(nn.Linear):
         # Mask weights with computed mask
         weight_thresholded = mask * self.weight
         # Compute output (linear layer) with masked weights
-        return F.linear(input, weight_thresholded, self.bias)
+        return nn.functional.linear(input, weight_thresholded, self.bias)
diff --git a/examples/research_projects/movement-pruning/masked_run_glue.py b/examples/research_projects/movement-pruning/masked_run_glue.py
index 48605ee053..7a74d0724c 100644
--- a/examples/research_projects/movement-pruning/masked_run_glue.py
+++ b/examples/research_projects/movement-pruning/masked_run_glue.py
@@ -24,8 +24,7 @@ import random
 
 import numpy as np
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
+from torch import nn
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
@@ -168,11 +167,11 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
 
     # multi-gpu training (should be after apex fp16 initialization)
     if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
             model,
             device_ids=[args.local_rank],
             output_device=args.local_rank,
@@ -287,9 +286,9 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                     )
 
                 loss_logits = (
-                    F.kl_div(
-                        input=F.log_softmax(logits_stu / args.temperature, dim=-1),
-                        target=F.softmax(logits_tea / args.temperature, dim=-1),
+                    nn.functional.kl_div(
+                        input=nn.functional.log_softmax(logits_stu / args.temperature, dim=-1),
+                        target=nn.functional.softmax(logits_tea / args.temperature, dim=-1),
                         reduction="batchmean",
                     )
                     * (args.temperature ** 2)
@@ -320,9 +319,9 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                 and (step + 1) == len(epoch_iterator)
             ):
                 if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                 else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     tb_writer.add_scalar("threshold", threshold, global_step)
@@ -436,8 +435,8 @@ def evaluate(args, model, tokenizer, prefix=""):
         eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
         # multi-gpu eval
-        if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
-            model = torch.nn.DataParallel(model)
+        if args.n_gpu > 1 and not isinstance(model, nn.DataParallel):
+            model = nn.DataParallel(model)
 
         # Eval!
         logger.info("***** Running evaluation {} *****".format(prefix))
diff --git a/examples/research_projects/movement-pruning/masked_run_squad.py b/examples/research_projects/movement-pruning/masked_run_squad.py
index 56f26eff10..a1c1cf2cfc 100644
--- a/examples/research_projects/movement-pruning/masked_run_squad.py
+++ b/examples/research_projects/movement-pruning/masked_run_squad.py
@@ -25,8 +25,7 @@ import timeit
 
 import numpy as np
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
+from torch import nn
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
@@ -176,11 +175,11 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
 
     # multi-gpu training (should be after apex fp16 initialization)
     if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
+        model = nn.DataParallel(model)
 
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(
+        model = nn.parallel.DistributedDataParallel(
             model,
             device_ids=[args.local_rank],
             output_device=args.local_rank,
@@ -308,17 +307,17 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                     )
 
                 loss_start = (
-                    F.kl_div(
-                        input=F.log_softmax(start_logits_stu / args.temperature, dim=-1),
-                        target=F.softmax(start_logits_tea / args.temperature, dim=-1),
+                    nn.functional.kl_div(
+                        input=nn.functional.log_softmax(start_logits_stu / args.temperature, dim=-1),
+                        target=nn.functional.softmax(start_logits_tea / args.temperature, dim=-1),
                         reduction="batchmean",
                     )
                     * (args.temperature ** 2)
                 )
                 loss_end = (
-                    F.kl_div(
-                        input=F.log_softmax(end_logits_stu / args.temperature, dim=-1),
-                        target=F.softmax(end_logits_tea / args.temperature, dim=-1),
+                    nn.functional.kl_div(
+                        input=nn.functional.log_softmax(end_logits_stu / args.temperature, dim=-1),
+                        target=nn.functional.softmax(end_logits_tea / args.temperature, dim=-1),
                         reduction="batchmean",
                     )
                     * (args.temperature ** 2)
@@ -346,9 +345,9 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
                 if args.fp16:
-                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                 else:
-                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     tb_writer.add_scalar("threshold", threshold, global_step)
@@ -454,8 +453,8 @@ def evaluate(args, model, tokenizer, prefix=""):
     eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
     # multi-gpu eval
-    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
-        model = torch.nn.DataParallel(model)
+    if args.n_gpu > 1 and not isinstance(model, nn.DataParallel):
+        model = nn.DataParallel(model)
 
     # Eval!
     logger.info("***** Running evaluation {} *****".format(prefix))
diff --git a/examples/research_projects/pplm/pplm_classification_head.py b/examples/research_projects/pplm/pplm_classification_head.py
index e85ba608b2..e26521fe39 100644
--- a/examples/research_projects/pplm/pplm_classification_head.py
+++ b/examples/research_projects/pplm/pplm_classification_head.py
@@ -1,19 +1,19 @@
-import torch
+from torch import nn
 
 
-class ClassificationHead(torch.nn.Module):
+class ClassificationHead(nn.Module):
     """Classification Head for  transformer encoders"""
 
     def __init__(self, class_size, embed_size):
         super().__init__()
         self.class_size = class_size
         self.embed_size = embed_size
-        # self.mlp1 = torch.nn.Linear(embed_size, embed_size)
-        # self.mlp2 = (torch.nn.Linear(embed_size, class_size))
-        self.mlp = torch.nn.Linear(embed_size, class_size)
+        # self.mlp1 = nn.Linear(embed_size, embed_size)
+        # self.mlp2 = (nn.Linear(embed_size, class_size))
+        self.mlp = nn.Linear(embed_size, class_size)
 
     def forward(self, hidden_state):
-        # hidden_state = F.relu(self.mlp1(hidden_state))
+        # hidden_state = nn.functional.relu(self.mlp1(hidden_state))
         # hidden_state = self.mlp2(hidden_state)
         logits = self.mlp(hidden_state)
         return logits
diff --git a/examples/research_projects/pplm/run_pplm.py b/examples/research_projects/pplm/run_pplm.py
index 8d605fac49..4be4f01fd4 100644
--- a/examples/research_projects/pplm/run_pplm.py
+++ b/examples/research_projects/pplm/run_pplm.py
@@ -30,7 +30,7 @@ from typing import List, Optional, Tuple, Union
 
 import numpy as np
 import torch
-import torch.nn.functional as F
+from torch import nn
 from tqdm import trange
 
 from pplm_classification_head import ClassificationHead
@@ -160,7 +160,7 @@ def perturb_past(
         new_accumulated_hidden = accumulated_hidden + torch.sum(hidden, dim=1).detach()
         # TODO: Check the layer-norm consistency of this with trained discriminator (Sumanth)
         logits = all_logits[:, -1, :]
-        probs = F.softmax(logits, dim=-1)
+        probs = nn.functional.softmax(logits, dim=-1)
 
         loss = 0.0
         loss_list = []
@@ -173,7 +173,7 @@ def perturb_past(
             print(" pplm_bow_loss:", loss.data.cpu().numpy())
 
         if loss_type == 2 or loss_type == 3:
-            ce_loss = torch.nn.CrossEntropyLoss()
+            ce_loss = nn.CrossEntropyLoss()
             # TODO why we need to do this assignment and not just using unpert_past? (Sumanth)
             curr_unpert_past = unpert_past
             curr_probs = torch.unsqueeze(probs, dim=1)
@@ -195,7 +195,7 @@ def perturb_past(
 
         kl_loss = 0.0
         if kl_scale > 0.0:
-            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
+            unpert_probs = nn.functional.softmax(unpert_logits[:, -1, :], dim=-1)
             unpert_probs = unpert_probs + SMALL_CONST * (unpert_probs <= SMALL_CONST).float().to(device).detach()
             correction = SMALL_CONST * (probs <= SMALL_CONST).float().to(device).detach()
             corrected_probs = probs + correction.detach()
@@ -527,10 +527,10 @@ def generate_text_pplm(
             else:
                 pert_logits[0, token_idx] /= repetition_penalty
 
-        pert_probs = F.softmax(pert_logits, dim=-1)
+        pert_probs = nn.functional.softmax(pert_logits, dim=-1)
 
         if classifier is not None:
-            ce_loss = torch.nn.CrossEntropyLoss()
+            ce_loss = nn.CrossEntropyLoss()
             prediction = classifier(torch.mean(unpert_last_hidden, dim=1))
             label = torch.tensor([class_label], device=device, dtype=torch.long)
             unpert_discrim_loss = ce_loss(prediction, label)
@@ -541,7 +541,7 @@ def generate_text_pplm(
         # Fuse the modified model and original model
         if perturb:
 
-            unpert_probs = F.softmax(unpert_logits[:, -1, :], dim=-1)
+            unpert_probs = nn.functional.softmax(unpert_logits[:, -1, :], dim=-1)
 
             pert_probs = (pert_probs ** gm_scale) * (unpert_probs ** (1 - gm_scale))  # + SMALL_CONST
             pert_probs = top_k_filter(pert_probs, k=top_k, probs=True)  # + SMALL_CONST
@@ -552,7 +552,7 @@ def generate_text_pplm(
 
         else:
             pert_logits = top_k_filter(pert_logits, k=top_k)  # + SMALL_CONST
-            pert_probs = F.softmax(pert_logits, dim=-1)
+            pert_probs = nn.functional.softmax(pert_logits, dim=-1)
 
         # sample or greedy
         if sample:
diff --git a/examples/research_projects/pplm/run_pplm_discrim_train.py b/examples/research_projects/pplm/run_pplm_discrim_train.py
index 51cdb56773..ec8cd9b9fa 100644
--- a/examples/research_projects/pplm/run_pplm_discrim_train.py
+++ b/examples/research_projects/pplm/run_pplm_discrim_train.py
@@ -23,10 +23,10 @@ import time
 
 import numpy as np
 import torch
-import torch.nn.functional as F
 import torch.optim as optim
 import torch.utils.data as data
 from nltk.tokenize.treebank import TreebankWordDetokenizer
+from torch import nn
 from torchtext import data as torchtext_data
 from torchtext import datasets
 from tqdm import tqdm, trange
@@ -42,7 +42,7 @@ example_sentence = "This is incredible! I love it, this is the best chicken I ha
 max_length_seq = 100
 
 
-class Discriminator(torch.nn.Module):
+class Discriminator(nn.Module):
     """Transformer encoder followed by a Classification Head"""
 
     def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
@@ -76,7 +76,7 @@ class Discriminator(torch.nn.Module):
             avg_hidden = self.avg_representation(x.to(self.device))
 
         logits = self.classifier_head(avg_hidden)
-        probs = F.log_softmax(logits, dim=-1)
+        probs = nn.functional.log_softmax(logits, dim=-1)
 
         return probs
 
@@ -140,7 +140,7 @@ def train_epoch(data_loader, discriminator, optimizer, epoch=0, log_interval=10,
         optimizer.zero_grad()
 
         output_t = discriminator(input_t)
-        loss = F.nll_loss(output_t, target_t)
+        loss = nn.functional.nll_loss(output_t, target_t)
         loss.backward(retain_graph=True)
         optimizer.step()
 
@@ -167,7 +167,7 @@ def evaluate_performance(data_loader, discriminator, device="cpu"):
             input_t, target_t = input_t.to(device), target_t.to(device)
             output_t = discriminator(input_t)
             # sum up batch loss
-            test_loss += F.nll_loss(output_t, target_t, reduction="sum").item()
+            test_loss += nn.functional.nll_loss(output_t, target_t, reduction="sum").item()
             # get the index of the max log-probability
             pred_t = output_t.argmax(dim=1, keepdim=True)
             correct += pred_t.eq(target_t.view_as(pred_t)).sum().item()
diff --git a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py
index 57e99e30ea..0e27896b1c 100644
--- a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py
+++ b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py
@@ -8,6 +8,7 @@ from pathlib import Path
 import pytest
 import pytorch_lightning as pl
 import torch
+from torch import nn
 
 import lightning_base
 from convert_pl_checkpoint_to_hf import convert_pl_to_hf
@@ -183,7 +184,7 @@ class TestSummarizationDistiller(TestCasePlus):
 
         logits = model(input_ids, attention_mask=mask, decoder_input_ids=decoder_input_ids, use_cache=False).logits
 
-        lprobs = torch.nn.functional.log_softmax(logits, dim=-1)
+        lprobs = nn.functional.log_softmax(logits, dim=-1)
         smoothed_loss, nll_loss = label_smoothed_nll_loss(
             lprobs, lm_labels, 0.1, ignore_index=model.config.pad_token_id
         )
diff --git a/examples/research_projects/seq2seq-distillation/distillation.py b/examples/research_projects/seq2seq-distillation/distillation.py
index 3b3bd80589..1f9106f0c0 100755
--- a/examples/research_projects/seq2seq-distillation/distillation.py
+++ b/examples/research_projects/seq2seq-distillation/distillation.py
@@ -10,7 +10,6 @@ from typing import List
 import pytorch_lightning as pl
 import torch
 from torch import nn
-from torch.nn import functional as F
 
 from finetune import SummarizationModule, TranslationModule
 from finetune import main as ft_main
@@ -123,8 +122,8 @@ class SummarizationDistiller(SummarizationModule):
         assert t_logits_slct.size() == s_logits_slct.size()
         loss_ce = (
             self.ce_loss_fct(
-                F.log_softmax(s_logits_slct / self.temperature, dim=-1),
-                F.softmax(t_logits_slct / self.temperature, dim=-1),
+                nn.functional.log_softmax(s_logits_slct / self.temperature, dim=-1),
+                nn.functional.softmax(t_logits_slct / self.temperature, dim=-1),
             )
             * (self.temperature) ** 2
         )
@@ -160,10 +159,10 @@ class SummarizationDistiller(SummarizationModule):
         assert lm_logits.shape[-1] == self.model.config.vocab_size
         if self.hparams.label_smoothing == 0:
             # Same behavior as modeling_bart.py, besides ignoring pad_token_id
-            loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
+            loss_fct = nn.CrossEntropyLoss(ignore_index=pad_token_id)
             student_lm_loss = loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), labels.view(-1))
         else:
-            lprobs = F.log_softmax(lm_logits, dim=-1)
+            lprobs = nn.functional.log_softmax(lm_logits, dim=-1)
             student_lm_loss, _ = label_smoothed_nll_loss(
                 lprobs, labels, self.hparams.label_smoothing, ignore_index=pad_token_id
             )
@@ -230,9 +229,9 @@ class SummarizationDistiller(SummarizationModule):
         teacher_states = torch.stack([hidden_states_T[j] for j in matches])
         assert student_states.shape == teacher_states.shape, f"{student_states.shape} != {teacher_states.shape}"
         if normalize_hidden:
-            student_states = F.layer_norm(student_states, student_states.shape[1:])
-            teacher_states = F.layer_norm(teacher_states, teacher_states.shape[1:])
-        mse = F.mse_loss(student_states, teacher_states, reduction="none")
+            student_states = nn.functional.layer_norm(student_states, student_states.shape[1:])
+            teacher_states = nn.functional.layer_norm(teacher_states, teacher_states.shape[1:])
+        mse = nn.functional.mse_loss(student_states, teacher_states, reduction="none")
         masked_mse = (mse * mask.unsqueeze(0).unsqueeze(-1)).sum() / valid_count
         return masked_mse
 
diff --git a/examples/research_projects/seq2seq-distillation/finetune.py b/examples/research_projects/seq2seq-distillation/finetune.py
index 156b4695a6..5874509377 100755
--- a/examples/research_projects/seq2seq-distillation/finetune.py
+++ b/examples/research_projects/seq2seq-distillation/finetune.py
@@ -13,6 +13,7 @@ from typing import Dict, List, Tuple
 import numpy as np
 import pytorch_lightning as pl
 import torch
+from torch import nn
 from torch.utils.data import DataLoader
 
 from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback, get_early_stopping_callback
@@ -151,12 +152,12 @@ class SummarizationModule(BaseTransformer):
         lm_logits = outputs["logits"]
         if self.hparams.label_smoothing == 0:
             # Same behavior as modeling_bart.py, besides ignoring pad_token_id
-            ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=pad_token_id)
+            ce_loss_fct = nn.CrossEntropyLoss(ignore_index=pad_token_id)
 
             assert lm_logits.shape[-1] == self.vocab_size
             loss = ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))
         else:
-            lprobs = torch.nn.functional.log_softmax(lm_logits, dim=-1)
+            lprobs = nn.functional.log_softmax(lm_logits, dim=-1)
             loss, nll_loss = label_smoothed_nll_loss(
                 lprobs, tgt_ids, self.hparams.label_smoothing, ignore_index=pad_token_id
             )
diff --git a/examples/research_projects/wav2vec2/run_asr.py b/examples/research_projects/wav2vec2/run_asr.py
index 410d5c2d3a..426643e0a4 100755
--- a/examples/research_projects/wav2vec2/run_asr.py
+++ b/examples/research_projects/wav2vec2/run_asr.py
@@ -9,8 +9,8 @@ from typing import Any, Callable, Dict, List, Optional, Set, Union
 import datasets
 import numpy as np
 import torch
-import torch.nn as nn
 from packaging import version
+from torch import nn
 
 import librosa
 from lang_trans import arabic
diff --git a/examples/research_projects/wav2vec2/run_pretrain.py b/examples/research_projects/wav2vec2/run_pretrain.py
index a34fa404a7..e0081e1dda 100755
--- a/examples/research_projects/wav2vec2/run_pretrain.py
+++ b/examples/research_projects/wav2vec2/run_pretrain.py
@@ -5,9 +5,9 @@ from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional, Union
 
 import torch
-import torch.nn as nn
 from datasets import DatasetDict, load_dataset
 from packaging import version
+from torch import nn
 
 import librosa
 from transformers import (