From 040283170cd559b59b8eb37fe9fe8e99ff7edcbc Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Mon, 14 Jun 2021 13:34:32 -0700
Subject: [PATCH] consistent nn. and nn.functional: part 5 docs (#12161)

---
 docs/source/add_new_model.rst        | 2 +-
 docs/source/main_classes/trainer.rst | 4 ++--
 docs/source/migration.md             | 4 ++--
 docs/source/quicktour.rst            | 4 ++--
 docs/source/task_summary.rst         | 4 ++--
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/source/add_new_model.rst b/docs/source/add_new_model.rst
index a7d47b600e..8a231cbca5 100644
--- a/docs/source/add_new_model.rst
+++ b/docs/source/add_new_model.rst
@@ -518,7 +518,7 @@ PyTorch, called ``SimpleModel`` as follows:
 
 .. code:: python
 
-   import torch.nn as nn
+   from torch import nn
 
    class SimpleModel(nn.Module):
        def __init__(self):
diff --git a/docs/source/main_classes/trainer.rst b/docs/source/main_classes/trainer.rst
index d702605f2e..35dfdcad33 100644
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -59,7 +59,7 @@ classification:
 
 .. code-block:: python
 
-    import torch
+    from torch import nn
     from transformers import Trainer
 
     class MultilabelTrainer(Trainer):
@@ -67,7 +67,7 @@ classification:
             labels = inputs.pop("labels")
             outputs = model(**inputs)
             logits = outputs.logits
-            loss_fct = torch.nn.BCEWithLogitsLoss()
+            loss_fct = nn.BCEWithLogitsLoss()
             loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                             labels.float().view(-1, self.model.config.num_labels))
             return (loss, outputs) if return_outputs else loss
diff --git a/docs/source/migration.md b/docs/source/migration.md
index 7b97867e33..37c50cb053 100644
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -23,7 +23,7 @@ expected changes:
 
 #### 1. AutoTokenizers and pipelines now use fast (rust) tokenizers by default.
 
-The python and rust tokenizers have roughly the same API, but the rust tokenizers have a more complete feature set. 
+The python and rust tokenizers have roughly the same API, but the rust tokenizers have a more complete feature set.
 
 This introduces two breaking changes:
 - The handling of overflowing tokens between the python and rust tokenizers is different.
@@ -85,7 +85,7 @@ This is a breaking change as importing intermediary layers using a model's modul
 
 ##### How to obtain the same behavior as v3.x in v4.x
 
-In order to obtain the same behavior as version `v3.x`, you should update the path used to access the layers. 
+In order to obtain the same behavior as version `v3.x`, you should update the path used to access the layers.
 
 In version `v3.x`:
 ```bash
diff --git a/docs/source/quicktour.rst b/docs/source/quicktour.rst
index c77da9894c..0e649b4c58 100644
--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -265,8 +265,8 @@ Let's apply the SoftMax activation to get predictions.
 .. code-block::
 
     >>> ## PYTORCH CODE
-    >>> import torch.nn.functional as F
-    >>> pt_predictions = F.softmax(pt_outputs.logits, dim=-1)
+    >>> from torch import nn
+    >>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
     >>> ## TENSORFLOW CODE
     >>> import tensorflow as tf
     >>> tf.nn.softmax(tf_outputs.logits, axis=-1)
diff --git a/docs/source/task_summary.rst b/docs/source/task_summary.rst
index 93a6716b65..bcce95fab2 100644
--- a/docs/source/task_summary.rst
+++ b/docs/source/task_summary.rst
@@ -451,7 +451,7 @@ of tokens.
     >>> ## PYTORCH CODE
     >>> from transformers import AutoModelWithLMHead, AutoTokenizer, top_k_top_p_filtering
     >>> import torch
-    >>> from torch.nn import functional as F
+    >>> from torch import nn
 
     >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
     >>> model = AutoModelWithLMHead.from_pretrained("gpt2")
@@ -467,7 +467,7 @@ of tokens.
     >>> filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
 
     >>> # sample
-    >>> probs = F.softmax(filtered_next_token_logits, dim=-1)
+    >>> probs = nn.functional.softmax(filtered_next_token_logits, dim=-1)
     >>> next_token = torch.multinomial(probs, num_samples=1)
 
     >>> generated = torch.cat([input_ids, next_token], dim=-1)