From 3c289fb38c61db0efc11a6b32451734fb385fccc Mon Sep 17 00:00:00 2001
From: Kevin Canwen Xu <canwenxu@126.com>
Date: Tue, 4 Aug 2020 01:17:56 +0800
Subject: [PATCH] Remove outdated BERT tips (#6217)

* Remove outdated BERT tips

* Update modeling_outputs.py

* Update bert.rst

* Update bert.rst
---
 docs/source/model_doc/bert.rst       | 9 ++-------
 src/transformers/modeling_outputs.py | 4 ----
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst
index cbc1c8aa77..1666260f96 100644
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -27,13 +27,8 @@ Tips:
 
 - BERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
   the right rather than the left.
-- BERT was trained with a masked language modeling (MLM) objective. It is therefore efficient at predicting masked
-  tokens and at NLU in general, but is not optimal for text generation. Models trained with a causal language
-  modeling (CLM) objective are better in that regard.
-- Alongside MLM, BERT was trained using a next sentence prediction (NSP) objective using the [CLS] token as a sequence
-  approximate. The user may use this token (the first token in a sequence built with special tokens) to get a sequence
-  prediction rather than a token prediction. However, averaging over the sequence may yield better results than using
-  the [CLS] token.
+- BERT was trained with the masked language modeling (MLM) and next sentence prediction (NSP) objectives. It is efficient at predicting masked
+  tokens and at NLU in general, but is not optimal for text generation.
 
 The original code can be found `here <https://github.com/google-research/bert>`_.
 
diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py
index 3a91d17904..1c36dc2d81 100644
--- a/src/transformers/modeling_outputs.py
+++ b/src/transformers/modeling_outputs.py
@@ -45,10 +45,6 @@ class BaseModelOutputWithPooling(ModelOutput):
             further processed by a Linear layer and a Tanh activation function. The Linear
             layer weights are trained from the next sentence prediction (classification)
             objective during pretraining.
-
-            This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
         hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
             Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
             of shape :obj:`(batch_size, sequence_length, hidden_size)`.