From b5e2b183af5e40e33a4dc7659e697d137259d56e Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Mon, 27 Dec 2021 19:07:46 -0500
Subject: [PATCH] Doc styler examples (#14953)

* Fix bad examples

* Add black formatting to style_doc

* Use first nonempty line

* Put it at the right place

* Don't add spaces to empty lines

* Better templates

* Deal with triple quotes in docstrings

* Result of style_doc

* Enable mdx treatment and fix code examples in MDXs

* Result of doc styler on doc source files

* Last fixes

* Break copy from
---
 docs/source/add_new_model.mdx                 |  23 +-
 docs/source/add_new_pipeline.mdx              |   4 +-
 docs/source/benchmarks.mdx                    |  13 +-
 docs/source/custom_datasets.mdx               |  95 +++---
 docs/source/debugging.mdx                     |  19 +-
 docs/source/glossary.mdx                      |   5 +-
 docs/source/internal/generation_utils.mdx     |   4 +-
 docs/source/main_classes/callback.mdx         |   3 +-
 docs/source/main_classes/deepspeed.mdx        |  40 ++-
 docs/source/main_classes/logging.mdx          |   1 +
 docs/source/main_classes/output.mdx           |   4 +-
 docs/source/main_classes/pipelines.mdx        |  55 +--
 docs/source/main_classes/processors.mdx       |   4 +-
 docs/source/main_classes/trainer.mdx          |   8 +-
 docs/source/migration.mdx                     |  38 ++-
 docs/source/model_doc/bart.mdx                |   9 +-
 docs/source/model_doc/bartpho.mdx             |   6 +-
 docs/source/model_doc/bert_japanese.mdx       |   6 +-
 docs/source/model_doc/bertgeneration.mdx      |  14 +-
 docs/source/model_doc/bertweet.mdx            |   6 +-
 docs/source/model_doc/blenderbot.mdx          |   5 +-
 docs/source/model_doc/byt5.mdx                |  22 +-
 docs/source/model_doc/canine.mdx              |  10 +-
 docs/source/model_doc/clip.mdx                |   4 +-
 docs/source/model_doc/gpt_neo.mdx             |  16 +-
 docs/source/model_doc/gptj.mdx                |  35 +-
 docs/source/model_doc/herbert.mdx             |   2 +-
 docs/source/model_doc/layoutlm.mdx            |  12 +-
 docs/source/model_doc/layoutlmv2.mdx          |  28 +-
 docs/source/model_doc/layoutxlm.mdx           |   4 +-
 docs/source/model_doc/longformer.mdx          |   4 +-
 docs/source/model_doc/luke.mdx                |  11 +-
 docs/source/model_doc/m2m_100.mdx             |   6 +-
 docs/source/model_doc/marian.mdx              |  33 +-
 docs/source/model_doc/mbart.mdx               |   6 +-
 docs/source/model_doc/mluke.mdx               |   4 +-
 docs/source/model_doc/pegasus.mdx             |  22 +-
 docs/source/model_doc/qdqbert.mdx             |  13 +-
 docs/source/model_doc/reformer.mdx            |   2 +-
 docs/source/model_doc/speech_to_text.mdx      |  10 +-
 docs/source/model_doc/speech_to_text_2.mdx    |   8 +-
 docs/source/model_doc/t5.mdx                  |  47 +--
 docs/source/model_doc/t5v1.1.mdx              |   2 +-
 docs/source/model_doc/tapas.mdx               | 315 ++++++++++--------
 docs/source/model_doc/visual_bert.mdx         |  12 +-
 docs/source/model_sharing.mdx                 |  10 +-
 docs/source/multilingual.mdx                  |   6 +-
 docs/source/perplexity.mdx                    |  16 +-
 docs/source/preprocessing.mdx                 |  65 ++--
 docs/source/quicktour.mdx                     |  52 +--
 docs/source/serialization.mdx                 |  13 +-
 docs/source/task_summary.mdx                  |  69 ++--
 docs/source/testing.mdx                       |  99 ++++--
 docs/source/tokenizer_summary.mdx             |   2 +
 docs/source/training.mdx                      |  19 +-
 src/transformers/configuration_utils.py       |  19 +-
 src/transformers/data/processors/squad.py     |   1 +
 src/transformers/debug_utils.py               |   4 +-
 src/transformers/feature_extraction_utils.py  |  21 +-
 src/transformers/file_utils.py                | 152 ++++-----
 src/transformers/generation_tf_utils.py       |  78 +++--
 src/transformers/generation_utils.py          | 124 ++++---
 src/transformers/keras_callbacks.py           |   4 +-
 src/transformers/modelcard.py                 |  12 +-
 src/transformers/modeling_flax_utils.py       |  34 +-
 src/transformers/modeling_tf_utils.py         |  11 +-
 src/transformers/modeling_utils.py            |  14 +-
 .../models/albert/configuration_albert.py     |   9 +-
 .../models/albert/modeling_albert.py          |   8 +-
 .../models/albert/modeling_flax_albert.py     |   4 +-
 .../models/albert/modeling_tf_albert.py       |   8 +-
 src/transformers/models/auto/auto_factory.py  |  33 +-
 .../models/auto/configuration_auto.py         |  16 +-
 .../models/auto/feature_extraction_auto.py    |   4 +-
 .../models/auto/processing_auto.py            |   4 +-
 .../models/auto/tokenization_auto.py          |   6 +-
 src/transformers/models/bart/modeling_bart.py |   4 +-
 .../models/bart/modeling_flax_bart.py         |  18 +-
 src/transformers/models/beit/modeling_beit.py |  24 +-
 .../models/beit/modeling_flax_beit.py         |  18 +-
 src/transformers/models/bert/modeling_bert.py |  16 +-
 .../models/bert/modeling_flax_bert.py         |  12 +-
 .../models/bert/modeling_tf_bert.py           |  18 +-
 .../modeling_bert_generation.py               |   6 +-
 .../models/bertweet/tokenization_bertweet.py  |   3 +-
 .../models/big_bird/modeling_big_bird.py      |   8 +-
 .../models/big_bird/modeling_flax_big_bird.py |   4 +-
 .../modeling_bigbird_pegasus.py               |   5 +-
 .../models/blenderbot/modeling_blenderbot.py  |   8 +-
 .../blenderbot/modeling_flax_blenderbot.py    |  18 +-
 .../modeling_blenderbot_small.py              |   8 +-
 .../modeling_flax_blenderbot_small.py         |  18 +-
 src/transformers/models/clip/modeling_clip.py |  16 +-
 .../models/clip/modeling_flax_clip.py         |  16 +-
 .../models/clip/modeling_tf_clip.py           |  16 +-
 .../models/convbert/configuration_convbert.py |   1 +
 .../models/deberta/modeling_deberta.py        |   4 +-
 .../models/deberta_v2/modeling_deberta_v2.py  |   4 +-
 src/transformers/models/deit/modeling_deit.py |  18 +-
 src/transformers/models/detr/modeling_detr.py |  18 +-
 .../models/distilbert/modeling_distilbert.py  |   8 +-
 src/transformers/models/dpr/modeling_dpr.py   |  29 +-
 .../models/dpr/modeling_tf_dpr.py             |  29 +-
 .../models/dpr/tokenization_dpr.py            |  15 +-
 .../models/dpr/tokenization_dpr_fast.py       |  15 +-
 .../models/electra/modeling_electra.py        |   8 +-
 .../models/electra/modeling_flax_electra.py   |   4 +-
 .../models/electra/modeling_tf_electra.py     |   4 +-
 .../configuration_encoder_decoder.py          |   8 +-
 .../modeling_encoder_decoder.py               |   9 +-
 .../modeling_flax_encoder_decoder.py          |  30 +-
 .../modeling_tf_encoder_decoder.py            |  12 +-
 src/transformers/models/fnet/modeling_fnet.py |  14 +-
 .../models/fnet/tokenization_fnet.py          |   2 +-
 .../models/fsmt/configuration_fsmt.py         |   2 +-
 .../models/funnel/modeling_funnel.py          |   6 +-
 .../models/funnel/modeling_tf_funnel.py       |   6 +-
 src/transformers/models/gpt2/modeling_gpt2.py |  40 ++-
 .../models/gpt2/modeling_tf_gpt2.py           |  10 +-
 src/transformers/models/gptj/modeling_gptj.py |  28 +-
 .../models/hubert/modeling_hubert.py          |   2 +
 .../models/hubert/modeling_tf_hubert.py       |   9 +-
 .../models/imagegpt/modeling_imagegpt.py      |  32 +-
 .../models/layoutlm/modeling_layoutlm.py      |  57 ++--
 .../models/layoutlm/modeling_tf_layoutlm.py   |  57 ++--
 .../models/layoutlmv2/modeling_layoutlmv2.py  |  18 +-
 src/transformers/models/led/modeling_led.py   |  29 +-
 .../models/led/modeling_tf_led.py             |  27 +-
 .../models/longformer/modeling_longformer.py  |  67 ++--
 .../longformer/modeling_tf_longformer.py      |  22 +-
 src/transformers/models/luke/modeling_luke.py |  29 +-
 .../models/m2m_100/modeling_m2m_100.py        |   8 +-
 .../models/m2m_100/tokenization_m2m_100.py    |   9 +-
 .../models/marian/modeling_flax_marian.py     |  24 +-
 .../models/marian/modeling_marian.py          |  26 +-
 .../models/marian/modeling_tf_marian.py       |   7 +-
 .../models/marian/tokenization_marian.py      |   8 +-
 .../models/mbart/modeling_flax_mbart.py       |  18 +-
 .../models/mbart/modeling_mbart.py            |   4 +-
 .../models/mbart/tokenization_mbart.py        |   5 +-
 .../models/mbart/tokenization_mbart_fast.py   |   7 +-
 .../models/mbart50/tokenization_mbart50.py    |   5 +-
 .../mbart50/tokenization_mbart50_fast.py      |   5 +-
 .../megatron_bert/modeling_megatron_bert.py   |  16 +-
 src/transformers/models/mmbt/modeling_mmbt.py |   4 +-
 .../models/mobilebert/modeling_mobilebert.py  |  10 +-
 .../mobilebert/modeling_tf_mobilebert.py      |  12 +-
 src/transformers/models/mt5/modeling_mt5.py   |   5 +-
 .../models/mt5/modeling_tf_mt5.py             |   5 +-
 .../models/openai/modeling_openai.py          |  10 +-
 .../models/openai/modeling_tf_openai.py       |  12 +-
 .../models/pegasus/modeling_flax_pegasus.py   |  18 +-
 .../models/pegasus/modeling_pegasus.py        |   8 +-
 .../models/perceiver/modeling_perceiver.py    | 100 +++---
 .../models/prophetnet/modeling_prophetnet.py  |  46 +--
 .../models/qdqbert/modeling_qdqbert.py        |  12 +-
 src/transformers/models/rag/modeling_rag.py   |  49 ++-
 .../models/rag/modeling_tf_rag.py             |  82 ++++-
 src/transformers/models/rag/retrieval_rag.py  |  23 +-
 .../models/rembert/modeling_rembert.py        |   4 +-
 .../models/roberta/modeling_roberta.py        |   4 +-
 .../models/roformer/modeling_roformer.py      |   4 +-
 .../models/roformer/tokenization_roformer.py  |   3 +-
 .../roformer/tokenization_roformer_fast.py    |   3 +-
 .../models/segformer/modeling_segformer.py    |  12 +-
 .../models/sew_d/modeling_sew_d.py            |   4 +-
 .../configuration_speech_encoder_decoder.py   |   8 +-
 .../modeling_speech_encoder_decoder.py        |   9 +-
 .../speech_to_text/modeling_speech_to_text.py |  12 +-
 .../modeling_speech_to_text_2.py              |  10 +-
 .../squeezebert/modeling_squeezebert.py       |   4 +-
 .../models/t5/modeling_flax_t5.py             |  34 +-
 src/transformers/models/t5/modeling_t5.py     |  59 ++--
 src/transformers/models/t5/modeling_tf_t5.py  |  28 +-
 .../models/tapas/configuration_tapas.py       |   1 +
 .../models/tapas/modeling_tapas.py            |  59 ++--
 .../models/tapas/modeling_tf_tapas.py         |  59 ++--
 .../models/trocr/modeling_trocr.py            |   2 +-
 .../models/unispeech/modeling_unispeech.py    |   4 +-
 .../unispeech_sat/modeling_unispeech_sat.py   |   4 +-
 .../configuration_vision_encoder_decoder.py   |   8 +-
 .../modeling_flax_vision_encoder_decoder.py   |  25 +-
 .../modeling_vision_encoder_decoder.py        |   9 +-
 .../configuration_vision_text_dual_encoder.py |   8 +-
 .../modeling_flax_vision_text_dual_encoder.py |  40 ++-
 .../modeling_vision_text_dual_encoder.py      |  40 ++-
 .../visual_bert/configuration_visual_bert.py  |   2 +-
 .../visual_bert/modeling_visual_bert.py       | 126 ++++---
 .../models/vit/modeling_flax_vit.py           |  12 +-
 .../models/vit/modeling_tf_vit.py             |  12 +-
 src/transformers/models/vit/modeling_vit.py   |  12 +-
 .../models/wav2vec2/modeling_flax_wav2vec2.py |  28 +-
 .../models/wav2vec2/modeling_tf_wav2vec2.py   |  20 +-
 .../models/wav2vec2/modeling_wav2vec2.py      |  12 +-
 .../models/wav2vec2/tokenization_wav2vec2.py  |   8 +-
 .../tokenization_wav2vec2_phoneme.py          |   8 +-
 src/transformers/models/xlm/modeling_xlm.py   |   8 +-
 .../xlm_prophetnet/modeling_xlm_prophetnet.py |  44 ++-
 .../models/xlnet/modeling_tf_xlnet.py         |  26 +-
 .../models/xlnet/modeling_xlnet.py            |  54 ++-
 src/transformers/optimization.py              |   3 +-
 src/transformers/pipelines/__init__.py        |   6 +-
 .../pipelines/table_question_answering.py     |   9 +-
 src/transformers/testing_utils.py             |   9 +-
 src/transformers/tokenization_utils.py        |  10 +-
 src/transformers/tokenization_utils_base.py   |  30 +-
 src/transformers/trainer_callback.py          |   1 -
 src/transformers/trainer_pt_utils.py          |   2 +-
 src/transformers/trainer_utils.py             |   2 +-
 src/transformers/utils/fx.py                  |   1 +
 utils/style_doc.py                            | 193 ++++++++++-
 211 files changed, 2738 insertions(+), 1711 deletions(-)

diff --git a/docs/source/add_new_model.mdx b/docs/source/add_new_model.mdx
index 9f49d78a58..08d804c9e3 100644
--- a/docs/source/add_new_model.mdx
+++ b/docs/source/add_new_model.mdx
@@ -267,7 +267,7 @@ single forward pass using a dummy integer vector of input IDs as an input. Such
 pseudocode):
 
 ```python
-model = BrandNewBertModel.load_pretrained_checkpoint(/path/to/checkpoint/)
+model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
 input_ids = [0, 4, 5, 2, 3, 7, 9]  # vector of input ids
 original_output = model.predict(input_ids)
 ```
@@ -476,6 +476,7 @@ following command should work:
 
 ```python
 from transformers import BrandNewBertModel, BrandNewBertConfig
+
 model = BrandNewBertModel(BrandNewBertConfig())
 ```
 
@@ -502,12 +503,13 @@ PyTorch, called `SimpleModel` as follows:
 ```python
 from torch import nn
 
+
 class SimpleModel(nn.Module):
     def __init__(self):
-            super().__init__()
-            self.dense = nn.Linear(10, 10)
-            self.intermediate = nn.Linear(10, 10)
-            self.layer_norm = nn.LayerNorm(10)
+        super().__init__()
+        self.dense = nn.Linear(10, 10)
+        self.intermediate = nn.Linear(10, 10)
+        self.layer_norm = nn.LayerNorm(10)
 ```
 
 Now we can create an instance of this model definition which will fill all weights: `dense`, `intermediate`,
@@ -565,7 +567,7 @@ In the conversion script, you should fill those randomly initialized weights wit
 corresponding layer in the checkpoint. *E.g.*
 
 ```python
-# retrieve matching layer weights, e.g. by 
+# retrieve matching layer weights, e.g. by
 # recursive algorithm
 layer_name = "dense"
 pretrained_weight = array_of_dense_layer
@@ -622,7 +624,7 @@ pass of the model using the original repository. Now you should write an analogo
 implementation instead of the original one. It should look as follows:
 
 ```python
-model = BrandNewBertModel.from_pretrained(/path/to/converted/checkpoint/folder)
+model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
 input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
 output = model(input_ids).last_hidden_states
 ```
@@ -668,7 +670,7 @@ fully comply with the required design. To make sure, the implementation is fully
 common tests should pass. The Cookiecutter should have automatically added a test file for your model, probably under
 the same `tests/test_modeling_brand_new_bert.py`. Run this test file to verify that all common tests pass:
 
-```python
+```bash
 pytest tests/test_modeling_brand_new_bert.py
 ```
 
@@ -714,7 +716,7 @@ that inputs a string and returns the `input_ids``. It could look similar to this
 
 ```python
 input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
-model = BrandNewBertModel.load_pretrained_checkpoint(/path/to/checkpoint/)
+model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
 input_ids = model.tokenize(input_str)
 ```
 
@@ -725,9 +727,10 @@ created. It should look similar to this:
 
 ```python
 from transformers import BrandNewBertTokenizer
+
 input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
 
-tokenizer = BrandNewBertTokenizer.from_pretrained(/path/to/tokenizer/folder/)
+tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/")
 
 input_ids = tokenizer(input_str).input_ids
 ```
diff --git a/docs/source/add_new_pipeline.mdx b/docs/source/add_new_pipeline.mdx
index 661e92a449..096ea423ec 100644
--- a/docs/source/add_new_pipeline.mdx
+++ b/docs/source/add_new_pipeline.mdx
@@ -26,6 +26,7 @@ Start by inheriting the base class `Pipeline`. with the 4 methods needed to impl
 ```python
 from transformers import Pipeline
 
+
 class MyPipeline(Pipeline):
     def _sanitize_parameters(self, **kwargs):
         preprocess_kwargs = {}
@@ -34,7 +35,7 @@ class MyPipeline(Pipeline):
         return preprocess_kwargs, {}, {}
 
     def preprocess(self, inputs, maybe_arg=2):
-        model_input = Tensor(....)
+        model_input = Tensor(inputs["input_ids"])
         return {"model_input": model_input}
 
     def _forward(self, model_inputs):
@@ -90,6 +91,7 @@ def postprocess(self, model_outputs, top_k=5):
     # Add logic to handle top_k
     return best_class
 
+
 def _sanitize_parameters(self, **kwargs):
     preprocess_kwargs = {}
     if "maybe_arg" in kwargs:
diff --git a/docs/source/benchmarks.mdx b/docs/source/benchmarks.mdx
index 731e6c1dfb..8752f76305 100644
--- a/docs/source/benchmarks.mdx
+++ b/docs/source/benchmarks.mdx
@@ -37,11 +37,12 @@ The benchmark classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] expect an
 
 >>> args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
 >>> benchmark = PyTorchBenchmark(args)
-
 ===PT-TF-SPLIT===
 >>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments
 
->>> args = TensorFlowBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+>>> args = TensorFlowBenchmarkArguments(
+...     models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
+... )
 >>> benchmark = TensorFlowBenchmark(args)
 ```
 
@@ -174,7 +175,9 @@ configurations must be inserted with the benchmark args as follows.
 ```py
 >>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig
 
->>> args = PyTorchBenchmarkArguments(models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+>>> args = PyTorchBenchmarkArguments(
+...     models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
+... )
 >>> config_base = BertConfig()
 >>> config_384_hid = BertConfig(hidden_size=384)
 >>> config_6_lay = BertConfig(num_hidden_layers=6)
@@ -244,7 +247,9 @@ bert-6-lay                 8              512            1359
 ===PT-TF-SPLIT===
 >>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig
 
->>> args = TensorFlowBenchmarkArguments(models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+>>> args = TensorFlowBenchmarkArguments(
+...     models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
+... )
 >>> config_base = BertConfig()
 >>> config_384_hid = BertConfig(hidden_size=384)
 >>> config_6_lay = BertConfig(num_hidden_layers=6)
diff --git a/docs/source/custom_datasets.mdx b/docs/source/custom_datasets.mdx
index 4ffcbbcbf9..5dd1801f38 100644
--- a/docs/source/custom_datasets.mdx
+++ b/docs/source/custom_datasets.mdx
@@ -54,6 +54,7 @@ The 🤗 Datasets library makes it simple to load a dataset:
 
 ```python
 from datasets import load_dataset
+
 imdb = load_dataset("imdb")
 ```
 
@@ -61,8 +62,9 @@ This loads a `DatasetDict` object which you can index into to view an example:
 
 ```python
 imdb["train"][0]
-{'label': 1,
- 'text': 'Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!'
+{
+    "label": 1,
+    "text": "Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as \"Teachers\". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is \"Teachers\". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!",
 }
 ```
 
@@ -74,6 +76,7 @@ model was trained with to ensure appropriately tokenized words. Load the DistilB
 
 ```python
 from transformers import AutoTokenizer
+
 tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
 ```
 
@@ -99,6 +102,7 @@ batch. This is known as **dynamic padding**. You can do this with the `DataColla
 
 ```python
 from transformers import DataCollatorWithPadding
+
 data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
 ```
 
@@ -108,6 +112,7 @@ Now load your model with the [`AutoModelForSequenceClassification`] class along
 
 ```python
 from transformers import AutoModelForSequenceClassification
+
 model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
 ```
 
@@ -121,7 +126,7 @@ At this point, only three steps remain:
 from transformers import TrainingArguments, Trainer
 
 training_args = TrainingArguments(
-    output_dir='./results',
+    output_dir="./results",
     learning_rate=2e-5,
     per_device_train_batch_size=16,
     per_device_eval_batch_size=16,
@@ -150,6 +155,7 @@ Make sure you set `return_tensors="tf"` to return `tf.Tensor` outputs instead of
 
 ```python
 from transformers import DataCollatorWithPadding
+
 data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
 ```
 
@@ -158,14 +164,14 @@ Next, convert your datasets to the `tf.data.Dataset` format with `to_tf_dataset`
 
 ```python
 tf_train_dataset = tokenized_imdb["train"].to_tf_dataset(
-    columns=['attention_mask', 'input_ids', 'label'],
+    columns=["attention_mask", "input_ids", "label"],
     shuffle=True,
     batch_size=16,
     collate_fn=data_collator,
 )
 
 tf_validation_dataset = tokenized_imdb["train"].to_tf_dataset(
-    columns=['attention_mask', 'input_ids', 'label'],
+    columns=["attention_mask", "input_ids", "label"],
     shuffle=False,
     batch_size=16,
     collate_fn=data_collator,
@@ -182,17 +188,14 @@ batch_size = 16
 num_epochs = 5
 batches_per_epoch = len(tokenized_imdb["train"]) // batch_size
 total_train_steps = int(batches_per_epoch * num_epochs)
-optimizer, schedule = create_optimizer(
-    init_lr=2e-5, 
-    num_warmup_steps=0, 
-    num_train_steps=total_train_steps
-)
+optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
 ```
 
 Load your model with the [`TFAutoModelForSequenceClassification`] class along with the number of expected labels:
 
 ```python
 from transformers import TFAutoModelForSequenceClassification
+
 model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
 ```
 
@@ -200,6 +203,7 @@ Compile the model:
 
 ```python
 import tensorflow as tf
+
 model.compile(optimizer=optimizer)
 ```
 
@@ -234,14 +238,15 @@ or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/no
 Load the WNUT 17 dataset from the 🤗 Datasets library:
 
 ```python
-from datasets import load_dataset
-wnut = load_dataset("wnut_17")
+>>> from datasets import load_dataset
+
+>>> wnut = load_dataset("wnut_17")
 ```
 
 A quick look at the dataset shows the labels associated with each word in the sentence:
 
 ```python
-wnut["train"][0]
+>>> wnut["train"][0]
 {'id': '0',
  'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0],
  'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']
@@ -251,21 +256,22 @@ wnut["train"][0]
 View the specific NER tags by:
 
 ```python
-label_list = wnut["train"].features[f"ner_tags"].feature.names
-label_list
-['O',
- 'B-corporation',
- 'I-corporation',
- 'B-creative-work',
- 'I-creative-work',
- 'B-group',
- 'I-group',
- 'B-location',
- 'I-location',
- 'B-person',
- 'I-person',
- 'B-product',
- 'I-product'
+>>> label_list = wnut["train"].features[f"ner_tags"].feature.names
+>>> label_list
+[
+    "O",
+    "B-corporation",
+    "I-corporation",
+    "B-creative-work",
+    "I-creative-work",
+    "B-group",
+    "I-group",
+    "B-location",
+    "I-location",
+    "B-person",
+    "I-person",
+    "B-product",
+    "I-product",
 ]
 ```
 
@@ -282,6 +288,7 @@ Now you need to tokenize the text. Load the DistilBERT tokenizer with an [`AutoT
 
 ```python
 from transformers import AutoTokenizer
+
 tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
 ```
 
@@ -289,9 +296,9 @@ Since the input has already been split into words, set `is_split_into_words=True
 subwords:
 
 ```python
-tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
-tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
-tokens
+>>> tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
+>>> tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
+>>> tokens
 ['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]']
 ```
 
@@ -314,10 +321,10 @@ def tokenize_and_align_labels(examples):
         word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
         previous_word_idx = None
         label_ids = []
-        for word_idx in word_ids:                            # Set the special tokens to -100.
+        for word_idx in word_ids:  # Set the special tokens to -100.
             if word_idx is None:
                 label_ids.append(-100)
-            elif word_idx != previous_word_idx:              # Only label the first token of a given word.
+            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                 label_ids.append(label[word_idx])
 
         labels.append(label_ids)
@@ -336,6 +343,7 @@ Finally, pad your text and labels, so they are a uniform length:
 
 ```python
 from transformers import DataCollatorForTokenClassification
+
 data_collator = DataCollatorForTokenClassification(tokenizer)
 ```
 
@@ -345,6 +353,7 @@ Load your model with the [`AutoModelForTokenClassification`] class along with th
 
 ```python
 from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
+
 model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_list))
 ```
 
@@ -352,7 +361,7 @@ Gather your training arguments in [`TrainingArguments`]:
 
 ```python
 training_args = TrainingArguments(
-    output_dir='./results',
+    output_dir="./results",
     evaluation_strategy="epoch",
     learning_rate=2e-5,
     per_device_train_batch_size=16,
@@ -387,6 +396,7 @@ Batch your examples together and pad your text and labels, so they are a uniform
 
 ```python
 from transformers import DataCollatorForTokenClassification
+
 data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="tf")
 ```
 
@@ -412,6 +422,7 @@ Load the model with the [`TFAutoModelForTokenClassification`] class along with t
 
 ```python
 from transformers import TFAutoModelForTokenClassification
+
 model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_list))
 ```
 
@@ -435,6 +446,7 @@ Compile the model:
 
 ```python
 import tensorflow as tf
+
 model.compile(optimizer=optimizer)
 ```
 
@@ -469,13 +481,14 @@ Load the SQuAD dataset from the 🤗 Datasets library:
 
 ```python
 from datasets import load_dataset
+
 squad = load_dataset("squad")
 ```
 
 Take a look at an example from the dataset:
 
 ```python
-squad["train"][0]
+>>> squad["train"][0]
 {'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
  'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
  'id': '5733be284776f41900661182',
@@ -490,6 +503,7 @@ Load the DistilBERT tokenizer with an [`AutoTokenizer`]:
 
 ```python
 from transformers import AutoTokenizer
+
 tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
 ```
 
@@ -567,6 +581,7 @@ Batch the processed examples together:
 
 ```python
 from transformers import default_data_collator
+
 data_collator = default_data_collator
 ```
 
@@ -576,6 +591,7 @@ Load your model with the [`AutoModelForQuestionAnswering`] class:
 
 ```python
 from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
+
 model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
 ```
 
@@ -583,7 +599,7 @@ Gather your training arguments in [`TrainingArguments`]:
 
 ```python
 training_args = TrainingArguments(
-    output_dir='./results',
+    output_dir="./results",
     evaluation_strategy="epoch",
     learning_rate=2e-5,
     per_device_train_batch_size=16,
@@ -618,6 +634,7 @@ Batch the processed examples together with a TensorFlow default data collator:
 
 ```python
 from transformers.data.data_collator import tf_default_collator
+
 data_collator = tf_default_collator
 ```
 
@@ -650,8 +667,8 @@ batch_size = 16
 num_epochs = 2
 total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
 optimizer, schedule = create_optimizer(
-    init_lr=2e-5, 
-    num_warmup_steps=0, 
+    init_lr=2e-5,
+    num_warmup_steps=0,
     num_train_steps=total_train_steps,
 )
 ```
@@ -660,6 +677,7 @@ Load your model with the [`TFAutoModelForQuestionAnswering`] class:
 
 ```python
 from transformers import TFAutoModelForQuestionAnswering
+
 model = TFAutoModelForQuestionAnswering("distilbert-base-uncased")
 ```
 
@@ -667,6 +685,7 @@ Compile the model:
 
 ```python
 import tensorflow as tf
+
 model.compile(optimizer=optimizer)
 ```
 
diff --git a/docs/source/debugging.mdx b/docs/source/debugging.mdx
index a3f05df48e..edb3a6ece9 100644
--- a/docs/source/debugging.mdx
+++ b/docs/source/debugging.mdx
@@ -49,6 +49,7 @@ If you're using your own training loop or another Trainer you can accomplish the
 
 ```python
 from .debug_utils import DebugUnderflowOverflow
+
 debug_overflow = DebugUnderflowOverflow(model)
 ```
 
@@ -200,13 +201,16 @@ def _forward(self, hidden_states):
     hidden_states = self.wo(hidden_states)
     return hidden_states
 
+
 import torch
+
+
 def forward(self, hidden_states):
     if torch.is_autocast_enabled():
-         with torch.cuda.amp.autocast(enabled=False):
-             return self._forward(hidden_states)
-     else:
-         return self._forward(hidden_states)
+        with torch.cuda.amp.autocast(enabled=False):
+            return self._forward(hidden_states)
+    else:
+        return self._forward(hidden_states)
 ```
 
 Since the automatic detector only reports on inputs and outputs of full frames, once you know where to look, you may
@@ -216,8 +220,10 @@ want to analyse the intermediary stages of any specific `forward` function as we
 ```python
 from debug_utils import detect_overflow
 
+
 class T5LayerFF(nn.Module):
     [...]
+
     def forward(self, hidden_states):
         forwarded_states = self.layer_norm(hidden_states)
         detect_overflow(forwarded_states, "after layer_norm")
@@ -237,6 +243,7 @@ its default, e.g.:
 
 ```python
 from .debug_utils import DebugUnderflowOverflow
+
 debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
 ```
 
@@ -248,7 +255,7 @@ Let's say you want to watch the absolute min and max values for all the ingredie
 batch, and only do that for batches 1 and 3. Then you instantiate this class as:
 
 ```python
-debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
+debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3])
 ```
 
 And now full batches 1 and 3 will be traced using the same format as the underflow/overflow detector does.
@@ -295,5 +302,5 @@ numbers started to diverge.
 You can also specify the batch number after which to stop the training, with:
 
 ```python
-debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)
+debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3)
 ```
diff --git a/docs/source/glossary.mdx b/docs/source/glossary.mdx
index 2685e2082d..b6cb2259d6 100644
--- a/docs/source/glossary.mdx
+++ b/docs/source/glossary.mdx
@@ -58,6 +58,7 @@ tokenizer, which is a [WordPiece](https://arxiv.org/pdf/1609.08144.pdf) tokenize
 
 ```python
 >>> from transformers import BertTokenizer
+
 >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
 
 >>> sequence = "A Titan RTX has 24GB of VRAM"
@@ -126,6 +127,7 @@ For example, consider these two sequences:
 
 ```python
 >>> from transformers import BertTokenizer
+
 >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
 
 >>> sequence_a = "This is a short sequence."
@@ -190,6 +192,7 @@ arguments (and not a list, like before) like this:
 
 ```python
 >>> from transformers import BertTokenizer
+
 >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
 >>> sequence_a = "HuggingFace is based in NYC"
 >>> sequence_b = "Where is HuggingFace based?"
@@ -212,7 +215,7 @@ the two types of sequence in the model.
 The tokenizer returns this mask as the "token_type_ids" entry:
 
 ```python
->>> encoded_dict['token_type_ids']
+>>> encoded_dict["token_type_ids"]
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
 ```
 
diff --git a/docs/source/internal/generation_utils.mdx b/docs/source/internal/generation_utils.mdx
index e1f844254f..88e5e9e315 100644
--- a/docs/source/internal/generation_utils.mdx
+++ b/docs/source/internal/generation_utils.mdx
@@ -32,8 +32,8 @@ Here's an example:
 ```python
 from transformers import GPT2Tokenizer, GPT2LMHeadModel
 
-tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-model = GPT2LMHeadModel.from_pretrained('gpt2')
+tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+model = GPT2LMHeadModel.from_pretrained("gpt2")
 
 inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
 generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
diff --git a/docs/source/main_classes/callback.mdx b/docs/source/main_classes/callback.mdx
index 9847be9357..032f1b571b 100644
--- a/docs/source/main_classes/callback.mdx
+++ b/docs/source/main_classes/callback.mdx
@@ -79,12 +79,13 @@ class MyCallback(TrainerCallback):
     def on_train_begin(self, args, state, control, **kwargs):
         print("Starting training")
 
+
 trainer = Trainer(
     model,
     args,
     train_dataset=train_dataset,
     eval_dataset=eval_dataset,
-    callbacks=[MyCallback]  # We can either pass the callback class this way or an instance of it (MyCallback())
+    callbacks=[MyCallback],  # We can either pass the callback class this way or an instance of it (MyCallback())
 )
 ```
 
diff --git a/docs/source/main_classes/deepspeed.mdx b/docs/source/main_classes/deepspeed.mdx
index c68a15fbc6..fb7ac7c4cb 100644
--- a/docs/source/main_classes/deepspeed.mdx
+++ b/docs/source/main_classes/deepspeed.mdx
@@ -295,11 +295,12 @@ If you're using only 1 GPU, here is how you'd have to adjust your training code
 # DeepSpeed requires a distributed environment even when only one process is used.
 # This emulates a launcher in the notebook
 import os
-os.environ['MASTER_ADDR'] = 'localhost'
-os.environ['MASTER_PORT'] = '9994' # modify if RuntimeError: Address already in use
-os.environ['RANK'] = "0"
-os.environ['LOCAL_RANK'] = "0"
-os.environ['WORLD_SIZE'] = "1"
+
+os.environ["MASTER_ADDR"] = "localhost"
+os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
+os.environ["RANK"] = "0"
+os.environ["LOCAL_RANK"] = "0"
+os.environ["WORLD_SIZE"] = "1"
 
 # Now proceed as normal, plus pass the deepspeed config file
 training_args = TrainingArguments(..., deepspeed="ds_config_zero3.json")
@@ -316,7 +317,7 @@ at the beginning of this section.
 If you want to create the config file on the fly in the notebook in the current directory, you could have a dedicated
 cell with:
 
-```python
+```python no-style
 %%bash
 cat <<'EOT' > ds_config_zero3.json
 {
@@ -382,14 +383,14 @@ EOT
 If the training script is in a normal file and not in the notebook cells, you can launch `deepspeed` normally via
 shell from a cell. For example, to use `run_translation.py` you would launch it with:
 
-```python
+```python no-style
 !git clone https://github.com/huggingface/transformers
 !cd transformers; deepspeed examples/pytorch/translation/run_translation.py ...
 ```
 
 or with `%%bash` magic, where you can write a multi-line code for the shell program to run:
 
-```python
+```python no-style
 %%bash
 
 git clone https://github.com/huggingface/transformers
@@ -512,7 +513,7 @@ TrainingArguments(..., deepspeed="/path/to/ds_config.json")
 or:
 
 ```python
-ds_config_dict=dict(scheduler=scheduler_params, optimizer=optimizer_params)
+ds_config_dict = dict(scheduler=scheduler_params, optimizer=optimizer_params)
 TrainingArguments(..., deepspeed=ds_config_dict)
 ```
 
@@ -1430,6 +1431,7 @@ If you have saved at least one checkpoint, and you want to use the latest one, y
 ```python
 from transformers.trainer_utils import get_last_checkpoint
 from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+
 checkpoint_dir = get_last_checkpoint(trainer.args.output_dir)
 fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
 ```
@@ -1439,6 +1441,7 @@ checkpoint), then you can finish the training by first saving the final model ex
 
 ```python
 from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+
 checkpoint_dir = os.path.join(trainer.args.output_dir, "checkpoint-final")
 trainer.deepspeed.save_checkpoint(checkpoint_dir)
 fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
@@ -1461,7 +1464,8 @@ these yourself as is shown in the following example:
 
 ```python
 from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
-state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+
+state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)  # already on cpu
 model = model.cpu()
 model.load_state_dict(state_dict)
 ```
@@ -1529,9 +1533,10 @@ context manager (which is also a function decorator), like so:
 ```python
 from transformers import T5ForConditionalGeneration, T5Config
 import deepspeed
+
 with deepspeed.zero.Init():
-   config = T5Config.from_pretrained("t5-small")
-   model = T5ForConditionalGeneration(config)
+    config = T5Config.from_pretrained("t5-small")
+    model = T5ForConditionalGeneration(config)
 ```
 
 As you can see this gives you a randomly initialized model.
@@ -1544,6 +1549,7 @@ section. Thus you must create the [`TrainingArguments`] object **before** callin
 
 ```python
 from transformers import AutoModel, Trainer, TrainingArguments
+
 training_args = TrainingArguments(..., deepspeed=ds_config)
 model = AutoModel.from_pretrained("t5-small")
 trainer = Trainer(model=model, args=training_args, ...)
@@ -1574,7 +1580,7 @@ limitations.
 Also under ZeRO-3, if you write your own code and run into a model parameter weight that looks like:
 
 ```python
-tensor([1.], device='cuda:0', dtype=torch.float16, requires_grad=True)
+tensor([1.0], device="cuda:0", dtype=torch.float16, requires_grad=True)
 ```
 
 stress on `tensor([1.])`, or if you get an error where it says the parameter is of size `1`, instead of some much
@@ -1715,9 +1721,9 @@ For example for a pretrained model:
 from transformers.deepspeed import HfDeepSpeedConfig
 from transformers import AutoModel, deepspeed
 
-ds_config = { ... } # deepspeed config object or path to the file
+ds_config = {...}  # deepspeed config object or path to the file
 # must run before instantiating the model
-dschf = HfDeepSpeedConfig(ds_config) # keep this object alive
+dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive
 model = AutoModel.from_pretrained("gpt2")
 engine = deepspeed.initialize(model=model, config_params=ds_config, ...)
 ```
@@ -1728,9 +1734,9 @@ or for non-pretrained model:
 from transformers.deepspeed import HfDeepSpeedConfig
 from transformers import AutoModel, AutoConfig, deepspeed
 
-ds_config = { ... } # deepspeed config object or path to the file
+ds_config = {...}  # deepspeed config object or path to the file
 # must run before instantiating the model
-dschf = HfDeepSpeedConfig(ds_config) # keep this object alive
+dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive
 config = AutoConfig.from_pretrained("gpt2")
 model = AutoModel.from_config(config)
 engine = deepspeed.initialize(model=model, config_params=ds_config, ...)
diff --git a/docs/source/main_classes/logging.mdx b/docs/source/main_classes/logging.mdx
index 467fe6d189..b707ca8698 100644
--- a/docs/source/main_classes/logging.mdx
+++ b/docs/source/main_classes/logging.mdx
@@ -21,6 +21,7 @@ to the INFO level.
 
 ```python
 import transformers
+
 transformers.logging.set_verbosity_info()
 ```
 
diff --git a/docs/source/main_classes/output.mdx b/docs/source/main_classes/output.mdx
index 5d406ac5d5..e0ef92eebc 100644
--- a/docs/source/main_classes/output.mdx
+++ b/docs/source/main_classes/output.mdx
@@ -22,8 +22,8 @@ Let's see of this looks on an example:
 from transformers import BertTokenizer, BertForSequenceClassification
 import torch
 
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
 
 inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
 labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
diff --git a/docs/source/main_classes/pipelines.mdx b/docs/source/main_classes/pipelines.mdx
index 16b337aa6f..a60dce9dc5 100644
--- a/docs/source/main_classes/pipelines.mdx
+++ b/docs/source/main_classes/pipelines.mdx
@@ -101,6 +101,7 @@ from transformers import pipeline
 
 pipe = pipeline("text-classification")
 
+
 def data():
     while True:
         # This could come from a dataset, a database, a queue or HTTP request
@@ -110,6 +111,7 @@ def data():
         # does the preprocessing while the main runs the big inference
         yield "This is a test"
 
+
 for out in pipe(data()):
     print(out)
     # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
@@ -125,10 +127,10 @@ All pipelines can use batching. This will work
 whenever the pipeline uses its streaming ability (so when passing lists or `Dataset` or `generator`).
 
 ```python
-from transformers import pipeline                                                   
+from transformers import pipeline
 from transformers.pipelines.base import KeyDataset
 import datasets
-import tqdm                                                                         
+import tqdm
 
 dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised")
 pipe = pipeline("text-classification", device=0)
@@ -149,28 +151,28 @@ Example where it's mostly a speedup:
 </Tip>
 
 ```python
-from transformers import pipeline                                                   
-from torch.utils.data import Dataset                                                
-import tqdm                                                                         
+from transformers import pipeline
+from torch.utils.data import Dataset
+import tqdm
 
 
-pipe = pipeline("text-classification", device=0)                                    
+pipe = pipeline("text-classification", device=0)
 
 
-class MyDataset(Dataset):                                                           
-    def __len__(self):                                                              
-        return 5000                                                                 
+class MyDataset(Dataset):
+    def __len__(self):
+        return 5000
 
-    def __getitem__(self, i):                                                       
-        return "This is a test"                                                     
+    def __getitem__(self, i):
+        return "This is a test"
 
 
-dataset = MyDataset()   
+dataset = MyDataset()
 
 for batch_size in [1, 8, 64, 256]:
-    print("-" * 30)                                                                     
-    print(f"Streaming batch_size={batch_size}")    
-    for out in tqdm.tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):              
+    print("-" * 30)
+    print(f"Streaming batch_size={batch_size}")
+    for out in tqdm.tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):
         pass
 ```
 
@@ -194,15 +196,15 @@ Streaming batch_size=256
 Example where it's most a slowdown:
 
 ```python
-class MyDataset(Dataset):                                                           
-    def __len__(self):                                                              
-        return 5000                                                                 
+class MyDataset(Dataset):
+    def __len__(self):
+        return 5000
 
-    def __getitem__(self, i):                                                       
-        if i % 64 == 0:                                                          
-            n = 100                                                              
-        else:                                                                    
-            n = 1                                                                
+    def __getitem__(self, i):
+        if i % 64 == 0:
+            n = 100
+        else:
+            n = 1
         return "This is a test" * n
 ```
 
@@ -298,10 +300,11 @@ If you want to try simply you can:
 
 ```python
 class MyPipeline(TextClassificationPipeline):
-    def postprocess(...):
-        ...
+    def postprocess(self, *args, **kwargs):
+        # Your code goes here
         scores = scores * 100
-        ...
+        # And here
+
 
 my_pipeline = MyPipeline(model=model, tokenizer=tokenizer, ...)
 # or if you use *pipeline* function, then:
diff --git a/docs/source/main_classes/processors.mdx b/docs/source/main_classes/processors.mdx
index 9ed295378c..8f5f1e048d 100644
--- a/docs/source/main_classes/processors.mdx
+++ b/docs/source/main_classes/processors.mdx
@@ -122,7 +122,7 @@ examples = processor.get_dev_examples(squad_v2_data_dir)
 processor = SquadV1Processor()
 examples = processor.get_dev_examples(squad_v1_data_dir)
 
-features = squad_convert_examples_to_features( 
+features = squad_convert_examples_to_features(
     examples=examples,
     tokenizer=tokenizer,
     max_seq_length=max_seq_length,
@@ -139,7 +139,7 @@ Using *tensorflow_datasets* is as easy as using a data file:
 tfds_examples = tfds.load("squad")
 examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
 
-features = squad_convert_examples_to_features( 
+features = squad_convert_examples_to_features(
     examples=examples,
     tokenizer=tokenizer,
     max_seq_length=max_seq_length,
diff --git a/docs/source/main_classes/trainer.mdx b/docs/source/main_classes/trainer.mdx
index dabda44439..a193b40ac8 100644
--- a/docs/source/main_classes/trainer.mdx
+++ b/docs/source/main_classes/trainer.mdx
@@ -53,14 +53,16 @@ Here is an example of how to customize [`Trainer`] using a custom loss function
 from torch import nn
 from transformers import Trainer
 
+
 class MultilabelTrainer(Trainer):
     def compute_loss(self, model, inputs, return_outputs=False):
         labels = inputs.get("labels")
         outputs = model(**inputs)
-        logits = outputs.get('logits')
+        logits = outputs.get("logits")
         loss_fct = nn.BCEWithLogitsLoss()
-        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
-                        labels.float().view(-1, self.model.config.num_labels))
+        loss = loss_fct(
+            logits.view(-1, self.model.config.num_labels), labels.float().view(-1, self.model.config.num_labels)
+        )
         return (loss, outputs) if return_outputs else loss
 ```
 
diff --git a/docs/source/migration.mdx b/docs/source/migration.mdx
index 65182561d9..7abf958751 100644
--- a/docs/source/migration.mdx
+++ b/docs/source/migration.mdx
@@ -209,7 +209,7 @@ Here is a `pytorch-pretrained-bert` to 🤗 Transformers conversion example for
 
 ```python
 # Let's load our model
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
 
 # If you used to have this line in pytorch-pretrained-bert:
 loss = model(input_ids, labels=labels)
@@ -222,7 +222,7 @@ loss = outputs[0]
 loss, logits = outputs[:2]
 
 # And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased", output_attentions=True)
 outputs = model(input_ids, labels=labels)
 loss, logits, attentions = outputs
 ```
@@ -241,23 +241,23 @@ Here is an example:
 
 ```python
 ### Let's load a model and tokenizer
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
 
 ### Do some stuff to our model and tokenizer
 # Ex: add new tokens to the vocabulary and embeddings of our model
-tokenizer.add_tokens(['[SPECIAL_TOKEN_1]', '[SPECIAL_TOKEN_2]'])
+tokenizer.add_tokens(["[SPECIAL_TOKEN_1]", "[SPECIAL_TOKEN_2]"])
 model.resize_token_embeddings(len(tokenizer))
 # Train our model
 train(model)
 
 ### Now let's save our model and tokenizer to a directory
-model.save_pretrained('./my_saved_model_directory/')
-tokenizer.save_pretrained('./my_saved_model_directory/')
+model.save_pretrained("./my_saved_model_directory/")
+tokenizer.save_pretrained("./my_saved_model_directory/")
 
 ### Reload the model and the tokenizer
-model = BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
-tokenizer = BertTokenizer.from_pretrained('./my_saved_model_directory/')
+model = BertForSequenceClassification.from_pretrained("./my_saved_model_directory/")
+tokenizer = BertTokenizer.from_pretrained("./my_saved_model_directory/")
 ```
 
 ### Optimizers: BertAdam & OpenAIAdam are now AdamW, schedules are standard PyTorch schedules
@@ -283,7 +283,13 @@ num_warmup_steps = 100
 warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1
 
 ### Previously BertAdam optimizer was instantiated like this:
-optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, num_training_steps=num_training_steps)
+optimizer = BertAdam(
+    model.parameters(),
+    lr=lr,
+    schedule="warmup_linear",
+    warmup=warmup_proportion,
+    num_training_steps=num_training_steps,
+)
 ### and used like this:
 for batch in train_data:
     loss = model(batch)
@@ -291,13 +297,19 @@ for batch in train_data:
     optimizer.step()
 
 ### In 🤗 Transformers, optimizer and schedules are split and instantiated like this:
-optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
-scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
+optimizer = AdamW(
+    model.parameters(), lr=lr, correct_bias=False
+)  # To reproduce BertAdam specific behavior set correct_bias=False
+scheduler = get_linear_schedule_with_warmup(
+    optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
+)  # PyTorch scheduler
 ### and used like this:
 for batch in train_data:
     loss = model(batch)
     loss.backward()
-    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
+    torch.nn.utils.clip_grad_norm_(
+        model.parameters(), max_grad_norm
+    )  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
     optimizer.step()
     scheduler.step()
 ```
diff --git a/docs/source/model_doc/bart.mdx b/docs/source/model_doc/bart.mdx
index 9097cde5f5..18d0e1e10f 100644
--- a/docs/source/model_doc/bart.mdx
+++ b/docs/source/model_doc/bart.mdx
@@ -64,12 +64,15 @@ The `facebook/bart-base` and `facebook/bart-large` checkpoints can be used to fi
 
 ```python
 from transformers import BartForConditionalGeneration, BartTokenizer
+
 model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", forced_bos_token_id=0)
 tok = BartTokenizer.from_pretrained("facebook/bart-large")
 example_english_phrase = "UN Chief Says There Is No <mask> in Syria"
-batch = tok(example_english_phrase, return_tensors='pt')
-generated_ids = model.generate(batch['input_ids'])
-assert tok.batch_decode(generated_ids, skip_special_tokens=True) == ['UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria']
+batch = tok(example_english_phrase, return_tensors="pt")
+generated_ids = model.generate(batch["input_ids"])
+assert tok.batch_decode(generated_ids, skip_special_tokens=True) == [
+    "UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria"
+]
 ```
 
 ## BartConfig
diff --git a/docs/source/model_doc/bartpho.mdx b/docs/source/model_doc/bartpho.mdx
index 2c704a2f22..d940173b42 100644
--- a/docs/source/model_doc/bartpho.mdx
+++ b/docs/source/model_doc/bartpho.mdx
@@ -44,6 +44,7 @@ Example of use:
 
 >>> # With TensorFlow 2.0+:
 >>> from transformers import TFAutoModel
+
 >>> bartpho = TFAutoModel.from_pretrained("vinai/bartpho-syllable")
 >>> input_ids = tokenizer(line, return_tensors="tf")
 >>> features = bartpho(**input_ids)
@@ -58,9 +59,10 @@ Tips:
 
 ```python
 >>> from transformers import MBartForConditionalGeneration
+
 >>> bartpho = MBartForConditionalGeneration.from_pretrained("vinai/bartpho-syllable")
->>> TXT = 'Chúng tôi là <mask> nghiên cứu viên.'
->>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
+>>> TXT = "Chúng tôi là <mask> nghiên cứu viên."
+>>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
 >>> logits = bartpho(input_ids).logits
 >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
 >>> probs = logits[0, masked_index].softmax(dim=0)
diff --git a/docs/source/model_doc/bert_japanese.mdx b/docs/source/model_doc/bert_japanese.mdx
index 170c3625e5..312714b379 100644
--- a/docs/source/model_doc/bert_japanese.mdx
+++ b/docs/source/model_doc/bert_japanese.mdx
@@ -30,7 +30,7 @@ Example of using a model with MeCab and WordPiece tokenization:
 
 ```python
 >>> import torch
->>> from transformers import AutoModel, AutoTokenizer 
+>>> from transformers import AutoModel, AutoTokenizer
 
 >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
 >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
@@ -40,7 +40,7 @@ Example of using a model with MeCab and WordPiece tokenization:
 
 >>> inputs = tokenizer(line, return_tensors="pt")
 
->>> print(tokenizer.decode(inputs['input_ids'][0]))
+>>> print(tokenizer.decode(inputs["input_ids"][0]))
 [CLS] 吾輩 は 猫 で ある 。 [SEP]
 
 >>> outputs = bertjapanese(**inputs)
@@ -57,7 +57,7 @@ Example of using a model with Character tokenization:
 
 >>> inputs = tokenizer(line, return_tensors="pt")
 
->>> print(tokenizer.decode(inputs['input_ids'][0]))
+>>> print(tokenizer.decode(inputs["input_ids"][0]))
 [CLS] 吾 輩 は 猫 で あ る 。 [SEP]
 
 >>> outputs = bertjapanese(**inputs)
diff --git a/docs/source/model_doc/bertgeneration.mdx b/docs/source/model_doc/bertgeneration.mdx
index cd5a04398f..3c6c229b6a 100644
--- a/docs/source/model_doc/bertgeneration.mdx
+++ b/docs/source/model_doc/bertgeneration.mdx
@@ -39,14 +39,18 @@ Usage:
 >>> # use BERT's cls token as BOS token and sep token as EOS token
 >>> encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
 >>> # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
->>> decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
+>>> decoder = BertGenerationDecoder.from_pretrained(
+...     "bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
+... )
 >>> bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
 
 >>> # create tokenizer...
 >>> tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
 
->>> input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
->>> labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
+>>> input_ids = tokenizer(
+...     "This is a long article to summarize", add_special_tokens=False, return_tensors="pt"
+... ).input_ids
+>>> labels = tokenizer("This is a short summary", return_tensors="pt").input_ids
 
 >>> # train...
 >>> loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
@@ -61,7 +65,9 @@ Usage:
 >>> sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
 >>> tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")
 
->>> input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids
+>>> input_ids = tokenizer(
+...     "This is the first sentence. This is the second sentence.", add_special_tokens=False, return_tensors="pt"
+... ).input_ids
 
 >>> outputs = sentence_fuser.generate(input_ids)
 
diff --git a/docs/source/model_doc/bertweet.mdx b/docs/source/model_doc/bertweet.mdx
index e685483cc5..df55360646 100644
--- a/docs/source/model_doc/bertweet.mdx
+++ b/docs/source/model_doc/bertweet.mdx
@@ -28,14 +28,14 @@ Example of use:
 
 ```python
 >>> import torch
->>> from transformers import AutoModel, AutoTokenizer 
+>>> from transformers import AutoModel, AutoTokenizer
 
 >>> bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
 
->>> # For transformers v4.x+: 
+>>> # For transformers v4.x+:
 >>> tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
 
->>> # For transformers v3.x: 
+>>> # For transformers v3.x:
 >>> # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
 
 >>> # INPUT TWEET IS ALREADY NORMALIZED!
diff --git a/docs/source/model_doc/blenderbot.mdx b/docs/source/model_doc/blenderbot.mdx
index 72ce9c04bd..ef0d5f9574 100644
--- a/docs/source/model_doc/blenderbot.mdx
+++ b/docs/source/model_doc/blenderbot.mdx
@@ -50,11 +50,12 @@ Here is an example of model usage:
 
 ```python
 >>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
->>> mname = 'facebook/blenderbot-400M-distill'
+
+>>> mname = "facebook/blenderbot-400M-distill"
 >>> model = BlenderbotForConditionalGeneration.from_pretrained(mname)
 >>> tokenizer = BlenderbotTokenizer.from_pretrained(mname)
 >>> UTTERANCE = "My friends are cool but they eat too many carbs."
->>> inputs = tokenizer([UTTERANCE], return_tensors='pt')
+>>> inputs = tokenizer([UTTERANCE], return_tensors="pt")
 >>> reply_ids = model.generate(**inputs)
 >>> print(tokenizer.batch_decode(reply_ids))
 ["<s> That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?</s>"]
diff --git a/docs/source/model_doc/byt5.mdx b/docs/source/model_doc/byt5.mdx
index 5b5932a165..06ed195226 100644
--- a/docs/source/model_doc/byt5.mdx
+++ b/docs/source/model_doc/byt5.mdx
@@ -51,12 +51,14 @@ ByT5 works on raw UTF-8 bytes, so it can be used without a tokenizer:
 from transformers import T5ForConditionalGeneration
 import torch
 
-model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')
+model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
 
 input_ids = torch.tensor([list("Life is like a box of chocolates.".encode("utf-8"))]) + 3  # add 3 for special tokens
-labels = torch.tensor([list("La vie est comme une boîte de chocolat.".encode("utf-8"))]) + 3  # add 3 for special tokens
+labels = (
+    torch.tensor([list("La vie est comme une boîte de chocolat.".encode("utf-8"))]) + 3
+)  # add 3 for special tokens
 
-loss = model(input_ids, labels=labels).loss # forward pass
+loss = model(input_ids, labels=labels).loss  # forward pass
 ```
 
 For batched inference and training it is however recommended to make use of the tokenizer:
@@ -64,13 +66,17 @@ For batched inference and training it is however recommended to make use of the
 ```python
 from transformers import T5ForConditionalGeneration, AutoTokenizer
 
-model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')
-tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')
+model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")
 
-model_inputs = tokenizer(["Life is like a box of chocolates.", "Today is Monday."], padding="longest", return_tensors="pt")
-labels = tokenizer(["La vie est comme une boîte de chocolat.", "Aujourd'hui c'est lundi."], padding="longest", return_tensors="pt").input_ids
+model_inputs = tokenizer(
+    ["Life is like a box of chocolates.", "Today is Monday."], padding="longest", return_tensors="pt"
+)
+labels = tokenizer(
+    ["La vie est comme une boîte de chocolat.", "Aujourd'hui c'est lundi."], padding="longest", return_tensors="pt"
+).input_ids
 
-loss = model(**model_inputs, labels=labels).loss # forward pass
+loss = model(**model_inputs, labels=labels).loss  # forward pass
 ```
 
 ## ByT5Tokenizer
diff --git a/docs/source/model_doc/canine.mdx b/docs/source/model_doc/canine.mdx
index 712af46049..e73777d000 100644
--- a/docs/source/model_doc/canine.mdx
+++ b/docs/source/model_doc/canine.mdx
@@ -64,13 +64,13 @@ CANINE works on raw characters, so it can be used without a tokenizer:
 >>> from transformers import CanineModel
 >>> import torch
 
->>> model = CanineModel.from_pretrained('google/canine-c') # model pre-trained with autoregressive character loss
+>>> model = CanineModel.from_pretrained("google/canine-c")  # model pre-trained with autoregressive character loss
 
 >>> text = "hello world"
 >>> # use Python's built-in ord() function to turn each character into its unicode code point id
 >>> input_ids = torch.tensor([[ord(char) for char in text]])
 
->>> outputs = model(input_ids) # forward pass
+>>> outputs = model(input_ids)  # forward pass
 >>> pooled_output = outputs.pooler_output
 >>> sequence_output = outputs.last_hidden_state
 ```
@@ -81,13 +81,13 @@ sequences to the same length):
 ```python
 >>> from transformers import CanineTokenizer, CanineModel
 
->>> model = CanineModel.from_pretrained('google/canine-c')
->>> tokenizer = CanineTokenizer.from_pretrained('google/canine-c')
+>>> model = CanineModel.from_pretrained("google/canine-c")
+>>> tokenizer = CanineTokenizer.from_pretrained("google/canine-c")
 
 >>> inputs = ["Life is like a box of chocolates.", "You never know what you gonna get."]
 >>> encoding = tokenizer(inputs, padding="longest", truncation=True, return_tensors="pt")
 
->>> outputs = model(**encoding) # forward pass
+>>> outputs = model(**encoding)  # forward pass
 >>> pooled_output = outputs.pooler_output
 >>> sequence_output = outputs.last_hidden_state
 ```
diff --git a/docs/source/model_doc/clip.mdx b/docs/source/model_doc/clip.mdx
index 2bbc6f1f8e..0ab0ec7689 100644
--- a/docs/source/model_doc/clip.mdx
+++ b/docs/source/model_doc/clip.mdx
@@ -69,8 +69,8 @@ encode the text and prepare the images. The following example shows how to get t
 >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
 
 >>> outputs = model(**inputs)
->>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
->>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
 ```
 
 This model was contributed by [valhalla](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/openai/CLIP).
diff --git a/docs/source/model_doc/gpt_neo.mdx b/docs/source/model_doc/gpt_neo.mdx
index 97fb46d7b2..f68b92b213 100644
--- a/docs/source/model_doc/gpt_neo.mdx
+++ b/docs/source/model_doc/gpt_neo.mdx
@@ -29,16 +29,24 @@ The `generate()` method can be used to generate text using GPT Neo model.
 
 ```python
 >>> from transformers import GPTNeoForCausalLM, GPT2Tokenizer
+
 >>> model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
 >>> tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
 
->>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
-...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
-...          "researchers was the fact that the unicorns spoke perfect English."
+>>> prompt = (
+...     "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
+...     "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
+...     "researchers was the fact that the unicorns spoke perfect English."
+... )
 
 >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
 
->>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+>>> gen_tokens = model.generate(
+...     input_ids,
+...     do_sample=True,
+...     temperature=0.9,
+...     max_length=100,
+... )
 >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
 ```
 
diff --git a/docs/source/model_doc/gptj.mdx b/docs/source/model_doc/gptj.mdx
index 2e29b4d570..67edd44483 100644
--- a/docs/source/model_doc/gptj.mdx
+++ b/docs/source/model_doc/gptj.mdx
@@ -33,7 +33,9 @@ Tips:
 >>> from transformers import GPTJForCausalLM
 >>> import torch
 
->>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True)
+>>> model = GPTJForCausalLM.from_pretrained(
+...     "EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True
+... )
 ```
 
 - The model should fit on 16GB GPU for inference. For training/fine-tuning it would take much more GPU RAM. Adam
@@ -56,16 +58,24 @@ model.
 
 ```python
 >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+
 >>> model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
 >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
 
->>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
-...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
-...          "researchers was the fact that the unicorns spoke perfect English."
+>>> prompt = (
+...     "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
+...     "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
+...     "researchers was the fact that the unicorns spoke perfect English."
+... )
 
 >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
 
->>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+>>> gen_tokens = model.generate(
+...     input_ids,
+...     do_sample=True,
+...     temperature=0.9,
+...     max_length=100,
+... )
 >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
 ```
 
@@ -78,13 +88,20 @@ model.
 >>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
 >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
 
->>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
-...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
-...          "researchers was the fact that the unicorns spoke perfect English."
+>>> prompt = (
+...     "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
+...     "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
+...     "researchers was the fact that the unicorns spoke perfect English."
+... )
 
 >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
 
->>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+>>> gen_tokens = model.generate(
+...     input_ids,
+...     do_sample=True,
+...     temperature=0.9,
+...     max_length=100,
+... )
 >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
 ```
 
diff --git a/docs/source/model_doc/herbert.mdx b/docs/source/model_doc/herbert.mdx
index 2be9409ef9..90e08ebe9a 100644
--- a/docs/source/model_doc/herbert.mdx
+++ b/docs/source/model_doc/herbert.mdx
@@ -41,7 +41,7 @@ Examples of use:
 >>> tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
 >>> model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
 
->>> encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
+>>> encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors="pt")
 >>> outputs = model(encoded_input)
 
 >>> # HerBERT can also be loaded using AutoTokenizer and AutoModel:
diff --git a/docs/source/model_doc/layoutlm.mdx b/docs/source/model_doc/layoutlm.mdx
index ebce375975..b1ee2a8cdb 100644
--- a/docs/source/model_doc/layoutlm.mdx
+++ b/docs/source/model_doc/layoutlm.mdx
@@ -53,12 +53,12 @@ Tips:
 
 ```python
 def normalize_bbox(bbox, width, height):
-     return [
-         int(1000 * (bbox[0] / width)),
-         int(1000 * (bbox[1] / height)),
-         int(1000 * (bbox[2] / width)),
-         int(1000 * (bbox[3] / height)),
-     ]
+    return [
+        int(1000 * (bbox[0] / width)),
+        int(1000 * (bbox[1] / height)),
+        int(1000 * (bbox[2] / width)),
+        int(1000 * (bbox[3] / height)),
+    ]
 ```
 
 Here, `width` and `height` correspond to the width and height of the original document in which the token
diff --git a/docs/source/model_doc/layoutlmv2.mdx b/docs/source/model_doc/layoutlmv2.mdx
index 503cb749ea..b1db86e2a2 100644
--- a/docs/source/model_doc/layoutlmv2.mdx
+++ b/docs/source/model_doc/layoutlmv2.mdx
@@ -70,12 +70,12 @@ Tips:
 
 ```python
 def normalize_bbox(bbox, width, height):
-     return [
-         int(1000 * (bbox[0] / width)),
-         int(1000 * (bbox[1] / height)),
-         int(1000 * (bbox[2] / width)),
-         int(1000 * (bbox[3] / height)),
-     ]
+    return [
+        int(1000 * (bbox[0] / width)),
+        int(1000 * (bbox[1] / height)),
+        int(1000 * (bbox[2] / width)),
+        int(1000 * (bbox[3] / height)),
+    ]
 ```
 
 Here, `width` and `height` correspond to the width and height of the original document in which the token
@@ -123,7 +123,7 @@ modality.
 ```python
 from transformers import LayoutLMv2FeatureExtractor, LayoutLMv2TokenizerFast, LayoutLMv2Processor
 
-feature_extractor = LayoutLMv2FeatureExtractor() # apply_ocr is set to True by default
+feature_extractor = LayoutLMv2FeatureExtractor()  # apply_ocr is set to True by default
 tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
 processor = LayoutLMv2Processor(feature_extractor, tokenizer)
 ```
@@ -158,7 +158,9 @@ from PIL import Image
 processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
 
 image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
-encoding = processor(image, return_tensors="pt") # you can also add all tokenizer parameters here such as padding, truncation
+encoding = processor(
+    image, return_tensors="pt"
+)  # you can also add all tokenizer parameters here such as padding, truncation
 print(encoding.keys())
 # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
 ```
@@ -177,7 +179,7 @@ processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncas
 
 image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
 words = ["hello", "world"]
-boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]  # make sure to normalize your bounding boxes
 encoding = processor(image, words, boxes=boxes, return_tensors="pt")
 print(encoding.keys())
 # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
@@ -199,7 +201,7 @@ processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncas
 
 image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
 words = ["hello", "world"]
-boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]  # make sure to normalize your bounding boxes
 word_labels = [1, 2]
 encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
 print(encoding.keys())
@@ -219,7 +221,7 @@ processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncas
 
 image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
 question = "What's his name?"
-encoding = processor(image, question, return_tensors="pt") 
+encoding = processor(image, question, return_tensors="pt")
 print(encoding.keys())
 # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
 ```
@@ -238,8 +240,8 @@ processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncas
 image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
 question = "What's his name?"
 words = ["hello", "world"]
-boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
-encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")  
+boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]  # make sure to normalize your bounding boxes
+encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
 print(encoding.keys())
 # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
 ```
diff --git a/docs/source/model_doc/layoutxlm.mdx b/docs/source/model_doc/layoutxlm.mdx
index 343302ba94..ed112453be 100644
--- a/docs/source/model_doc/layoutxlm.mdx
+++ b/docs/source/model_doc/layoutxlm.mdx
@@ -34,7 +34,7 @@ One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like
 ```python
 from transformers import LayoutLMv2Model
 
-model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')
+model = LayoutLMv2Model.from_pretrained("microsoft/layoutxlm-base")
 ```
 
 Note that LayoutXLM has its own tokenizer, based on
@@ -44,7 +44,7 @@ follows:
 ```python
 from transformers import LayoutXLMTokenizer
 
-tokenizer = LayoutXLMTokenizer.from_pretrained('microsoft/layoutxlm-base')
+tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
 ```
 
 Similar to LayoutLMv2, you can use [`LayoutXLMProcessor`] (which internally applies
diff --git a/docs/source/model_doc/longformer.mdx b/docs/source/model_doc/longformer.mdx
index dcb6d8ad67..2bef5ac1a3 100644
--- a/docs/source/model_doc/longformer.mdx
+++ b/docs/source/model_doc/longformer.mdx
@@ -75,8 +75,8 @@ For more information, please refer to the official [paper](https://arxiv.org/pdf
 trained and should be used as follows:
 
 ```python
-input_ids = tokenizer.encode('This is a sentence from [MASK] training data', return_tensors='pt')
-mlm_labels = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+input_ids = tokenizer.encode("This is a sentence from [MASK] training data", return_tensors="pt")
+mlm_labels = tokenizer.encode("This is a sentence from the training data", return_tensors="pt")
 
 loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]
 ```
diff --git a/docs/source/model_doc/luke.mdx b/docs/source/model_doc/luke.mdx
index caf4008815..1bbf6acb84 100644
--- a/docs/source/model_doc/luke.mdx
+++ b/docs/source/model_doc/luke.mdx
@@ -84,24 +84,27 @@ Example:
 
 >>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
 >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
-
 # Example 1: Computing the contextualized entity representation corresponding to the entity mention "Beyoncé"
+
 >>> text = "Beyoncé lives in Los Angeles."
 >>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"
 >>> inputs = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
 >>> outputs = model(**inputs)
 >>> word_last_hidden_state = outputs.last_hidden_state
 >>> entity_last_hidden_state = outputs.entity_last_hidden_state
-
 # Example 2: Inputting Wikipedia entities to obtain enriched contextualized representations
->>> entities = ["Beyoncé", "Los Angeles"]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
+
+>>> entities = [
+...     "Beyoncé",
+...     "Los Angeles",
+... ]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
 >>> entity_spans = [(0, 7), (17, 28)]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
 >>> inputs = tokenizer(text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
 >>> outputs = model(**inputs)
 >>> word_last_hidden_state = outputs.last_hidden_state
 >>> entity_last_hidden_state = outputs.entity_last_hidden_state
-
 # Example 3: Classifying the relationship between two entities using LukeForEntityPairClassification head model
+
 >>> model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
 >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
 >>> entity_spans = [(0, 7), (17, 28)]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
diff --git a/docs/source/model_doc/m2m_100.mdx b/docs/source/model_doc/m2m_100.mdx
index 58fe6571af..65e119aa4e 100644
--- a/docs/source/model_doc/m2m_100.mdx
+++ b/docs/source/model_doc/m2m_100.mdx
@@ -49,8 +49,8 @@ examples. To install `sentencepiece` run `pip install sentencepiece`.
 ```python
 from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer
 
-model = M2M100ForConditionalGeneration.from_pretrained('facebook/m2m100_418M')
-tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M', src_lang="en", tgt_lang="fr")
+model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="fr")
 
 src_text = "Life is like a box of chocolates."
 tgt_text = "La vie est comme une boîte de chocolat."
@@ -59,7 +59,7 @@ model_inputs = tokenizer(src_text, return_tensors="pt")
 with tokenizer.as_target_tokenizer():
     labels = tokenizer(tgt_text, return_tensors="pt").input_ids
 
-loss = model(**model_inputs, labels=labels) # forward pass
+loss = model(**model_inputs, labels=labels).loss  # forward pass
 ```
 
 - Generation
diff --git a/docs/source/model_doc/marian.mdx b/docs/source/model_doc/marian.mdx
index c74f9d37fc..74e244b8b6 100644
--- a/docs/source/model_doc/marian.mdx
+++ b/docs/source/model_doc/marian.mdx
@@ -65,13 +65,14 @@ require 3 character language codes:
 
 ```python
 >>> from transformers import MarianMTModel, MarianTokenizer
->>> src_text = [
-...     '>>fra<< this is a sentence in english that we want to translate to french',
-...     '>>por<< This should go to portuguese',
-...     '>>esp<< And this to Spanish'
->>> ]
 
->>> model_name = 'Helsinki-NLP/opus-mt-en-roa'
+>>> src_text = [
+...     ">>fra<< this is a sentence in english that we want to translate to french",
+...     ">>por<< This should go to portuguese",
+...     ">>esp<< And this to Spanish",
+... ]
+
+>>> model_name = "Helsinki-NLP/opus-mt-en-roa"
 >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
 >>> print(tokenizer.supported_language_codes)
 ['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<']
@@ -88,11 +89,12 @@ Here is the code to see all available pretrained models on the hub:
 
 ```python
 from huggingface_hub import list_models
+
 model_list = list_models()
 org = "Helsinki-NLP"
 model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
-suffix = [x.split('/')[1] for x in model_ids]
-old_style_multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
+suffix = [x.split("/")[1] for x in model_ids]
+old_style_multi_models = [f"{org}/{s}" for s in suffix if s != s.lower()]
 ```
 
 ## Old Style Multi-Lingual Models
@@ -100,7 +102,7 @@ old_style_multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
 These are the old style multi-lingual models ported from the OPUS-MT-Train repo: and the members of each language
 group:
 
-```python
+```python no-style
 ['Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU',
  'Helsinki-NLP/opus-mt-ROMANCE-en',
  'Helsinki-NLP/opus-mt-SCANDINAVIA-SCANDINAVIA',
@@ -129,13 +131,14 @@ Example of translating english to many romance languages, using old-style 2 char
 
 ```python
 >>> from transformers import MarianMTModel, MarianTokenizer
->>> src_text = [
-...     '>>fr<< this is a sentence in english that we want to translate to french',
-...     '>>pt<< This should go to portuguese',
-...     '>>es<< And this to Spanish'
->>> ]
 
->>> model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
+>>> src_text = [
+...     ">>fr<< this is a sentence in english that we want to translate to french",
+...     ">>pt<< This should go to portuguese",
+...     ">>es<< And this to Spanish",
+... ]
+
+>>> model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
 >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
 
 >>> model = MarianMTModel.from_pretrained(model_name)
diff --git a/docs/source/model_doc/mbart.mdx b/docs/source/model_doc/mbart.mdx
index d61fb33d09..0f3d82ce5d 100644
--- a/docs/source/model_doc/mbart.mdx
+++ b/docs/source/model_doc/mbart.mdx
@@ -52,7 +52,7 @@ inside the context manager [`~MBartTokenizer.as_target_tokenizer`] to encode tar
 
 >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
 >>> # forward pass
->>> model(**inputs, labels=batch['labels'])
+>>> model(**inputs, labels=batch["labels"])
 ```
 
 - Generation
@@ -106,13 +106,13 @@ model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
 tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
 
 src_text = " UN Chief Says There Is No Military Solution in Syria"
-tgt_text =  "Şeful ONU declară că nu există o soluţie militară în Siria"
+tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
 
 model_inputs = tokenizer(src_text, return_tensors="pt")
 with tokenizer.as_target_tokenizer():
     labels = tokenizer(tgt_text, return_tensors="pt").input_ids
 
-model(**model_inputs, labels=labels) # forward pass
+model(**model_inputs, labels=labels)  # forward pass
 ```
 
 - Generation
diff --git a/docs/source/model_doc/mluke.mdx b/docs/source/model_doc/mluke.mdx
index ac1f3fb19f..b910f17ae2 100644
--- a/docs/source/model_doc/mluke.mdx
+++ b/docs/source/model_doc/mluke.mdx
@@ -38,7 +38,7 @@ One can directly plug in the weights of mLUKE into a LUKE model, like so:
 ```python
 from transformers import LukeModel
 
-model = LukeModel.from_pretrained('studio-ousia/mluke-base')
+model = LukeModel.from_pretrained("studio-ousia/mluke-base")
 ```
 
 Note that mLUKE has its own tokenizer, [`MLukeTokenizer`]. You can initialize it as follows:
@@ -46,7 +46,7 @@ Note that mLUKE has its own tokenizer, [`MLukeTokenizer`]. You can initialize it
 ```python
 from transformers import MLukeTokenizer
 
-tokenizer = MLukeTokenizer.from_pretrained('studio-ousia/mluke-base')
+tokenizer = MLukeTokenizer.from_pretrained("studio-ousia/mluke-base")
 ```
 
 As mLUKE's architecture is equivalent to that of LUKE, one can refer to [LUKE's documentation page](luke) for all
diff --git a/docs/source/model_doc/pegasus.mdx b/docs/source/model_doc/pegasus.mdx
index c05fcf80fa..1a8a4a7386 100644
--- a/docs/source/model_doc/pegasus.mdx
+++ b/docs/source/model_doc/pegasus.mdx
@@ -69,18 +69,22 @@ All the [checkpoints](https://huggingface.co/models?search=pegasus) are fine-tun
 ```python
 >>> from transformers import PegasusForConditionalGeneration, PegasusTokenizer
 >>> import torch
+
 >>> src_text = [
 ...     """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
->>> ]
+... ]
 
->>> model_name = 'google/pegasus-xsum'
->>> device = 'cuda' if torch.cuda.is_available() else 'cpu'
->>> tokenizer = PegasusTokenizer.from_pretrained(model_name)
->>> model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
->>> batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
->>> translated = model.generate(**batch)
->>> tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
->>> assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
+>>> model_name = "google/pegasus-xsum"
+>>> device = "cuda" if torch.cuda.is_available() else "cpu"
+>>> tokenizer = PegasusTokenizer.from_pretrained(model_name)
+>>> model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
+>>> batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(device)
+>>> translated = model.generate(**batch)
+>>> tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
+>>> assert (
+...     tgt_text[0]
+...     == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
+... )
 ```
 
 ## PegasusConfig
diff --git a/docs/source/model_doc/qdqbert.mdx b/docs/source/model_doc/qdqbert.mdx
index 210fa10892..df7b7bcee6 100644
--- a/docs/source/model_doc/qdqbert.mdx
+++ b/docs/source/model_doc/qdqbert.mdx
@@ -75,9 +75,9 @@ tensors. After setting up the tensor quantizers, one can use the following examp
 ```python
 >>> # Find the TensorQuantizer and enable calibration
 >>> for name, module in model.named_modules():
->>>     if name.endswith('_input_quantizer'):
->>>         module.enable_calib()
->>>         module.disable_quant()  # Use full precision data to calibrate
+...     if name.endswith("_input_quantizer"):
+...         module.enable_calib()
+...         module.disable_quant()  # Use full precision data to calibrate
 
 >>> # Feeding data samples
 >>> model(x)
@@ -85,9 +85,9 @@ tensors. After setting up the tensor quantizers, one can use the following examp
 
 >>> # Finalize calibration
 >>> for name, module in model.named_modules():
->>>     if name.endswith('_input_quantizer'):
->>>         module.load_calib_amax()
->>>         module.enable_quant()
+...     if name.endswith("_input_quantizer"):
+...         module.load_calib_amax()
+...         module.enable_quant()
 
 >>> # If running on GPU, it needs to call .cuda() again because new tensors will be created by calibration process
 >>> model.cuda()
@@ -105,6 +105,7 @@ the instructions in [torch.onnx](https://pytorch.org/docs/stable/onnx.html). Exa
 
 ```python
 >>> from pytorch_quantization.nn import TensorQuantizer
+
 >>> TensorQuantizer.use_fb_fake_quant = True
 
 >>> # Load the calibrated model
diff --git a/docs/source/model_doc/reformer.mdx b/docs/source/model_doc/reformer.mdx
index 24c8375f60..777a333e7b 100644
--- a/docs/source/model_doc/reformer.mdx
+++ b/docs/source/model_doc/reformer.mdx
@@ -134,7 +134,7 @@ easily be trained on sequences as long as 64000 tokens.
 For training, the [`ReformerModelWithLMHead`] should be used as follows:
 
 ```python
-input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+input_ids = tokenizer.encode("This is a sentence from the training data", return_tensors="pt")
 loss = model(input_ids, labels=input_ids)[0]
 ```
 
diff --git a/docs/source/model_doc/speech_to_text.mdx b/docs/source/model_doc/speech_to_text.mdx
index 58945ac3a6..1a8cce2931 100644
--- a/docs/source/model_doc/speech_to_text.mdx
+++ b/docs/source/model_doc/speech_to_text.mdx
@@ -52,11 +52,13 @@ be installed as follows: `apt install libsndfile1-dev`
 >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
 >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
 
+
 >>> def map_to_array(batch):
 ...     speech, _ = sf.read(batch["file"])
 ...     batch["speech"] = speech
 ...     return batch
 
+
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.map(map_to_array)
 
@@ -83,16 +85,22 @@ be installed as follows: `apt install libsndfile1-dev`
 >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
 >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
 
+
 >>> def map_to_array(batch):
 ...     speech, _ = sf.read(batch["file"])
 ...     batch["speech"] = speech
 ...     return batch
 
+
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.map(map_to_array)
 
 >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
->>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask], forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"])
+>>> generated_ids = model.generate(
+...     input_ids=inputs["input_features"],
+...     attention_mask=inputs["attention_mask"],
+...     forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"],
+... )
 
 >>> translation = processor.batch_decode(generated_ids)
 ```
diff --git a/docs/source/model_doc/speech_to_text_2.mdx b/docs/source/model_doc/speech_to_text_2.mdx
index a95f1d0af9..14c4e3160b 100644
--- a/docs/source/model_doc/speech_to_text_2.mdx
+++ b/docs/source/model_doc/speech_to_text_2.mdx
@@ -58,11 +58,13 @@ predicted token ids.
 >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
 >>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
 
+
 >>> def map_to_array(batch):
 ...     speech, _ = sf.read(batch["file"])
 ...     batch["speech"] = speech
 ...     return batch
 
+
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.map(map_to_array)
 
@@ -81,7 +83,11 @@ predicted token ids.
 >>> from transformers import pipeline
 
 >>> librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
->>> asr = pipeline("automatic-speech-recognition", model="facebook/s2t-wav2vec2-large-en-de", feature_extractor="facebook/s2t-wav2vec2-large-en-de")
+>>> asr = pipeline(
+...     "automatic-speech-recognition",
+...     model="facebook/s2t-wav2vec2-large-en-de",
+...     feature_extractor="facebook/s2t-wav2vec2-large-en-de",
+... )
 
 >>> translation_de = asr(librispeech_en[0]["file"])
 ```
diff --git a/docs/source/model_doc/t5.mdx b/docs/source/model_doc/t5.mdx
index 5f1e00d5ba..47bcdc662f 100644
--- a/docs/source/model_doc/t5.mdx
+++ b/docs/source/model_doc/t5.mdx
@@ -98,8 +98,8 @@ language modeling head on top of the decoder.
   tokenizer = T5Tokenizer.from_pretrained("t5-small")
   model = T5ForConditionalGeneration.from_pretrained("t5-small")
 
-  input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-  labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
+  input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+  labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
   # the forward function automatically creates the correct decoder_input_ids
   loss = model(input_ids=input_ids, labels=labels).loss
   ```
@@ -120,8 +120,8 @@ language modeling head on top of the decoder.
   tokenizer = T5Tokenizer.from_pretrained("t5-small")
   model = T5ForConditionalGeneration.from_pretrained("t5-small")
 
-  input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
-  labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
+  input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
+  labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids
   # the forward function automatically creates the correct decoder_input_ids
   loss = model(input_ids=input_ids, labels=labels).loss
   ```
@@ -148,7 +148,7 @@ language modeling head on top of the decoder.
   ignored. The code example below illustrates all of this.
 
   ```python
-  from transformers import T5Tokenizer, T5ForConditionalGeneration 
+  from transformers import T5Tokenizer, T5ForConditionalGeneration
   import torch
 
   tokenizer = T5Tokenizer.from_pretrained("t5-small")
@@ -168,18 +168,19 @@ language modeling head on top of the decoder.
   # encode the inputs
   task_prefix = "translate English to French: "
   input_sequences = [input_sequence_1, input_sequence_2]
-  encoding = tokenizer([task_prefix + sequence for sequence in input_sequences], 
-                      padding='longest', 
-                      max_length=max_source_length, 
-                      truncation=True, 
-                      return_tensors="pt")
+  encoding = tokenizer(
+      [task_prefix + sequence for sequence in input_sequences],
+      padding="longest",
+      max_length=max_source_length,
+      truncation=True,
+      return_tensors="pt",
+  )
   input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
 
   # encode the targets
-  target_encoding = tokenizer([output_sequence_1, output_sequence_2], 
-                              padding='longest', 
-                              max_length=max_target_length, 
-                              truncation=True)
+  target_encoding = tokenizer(
+      [output_sequence_1, output_sequence_2], padding="longest", max_length=max_target_length, truncation=True
+  )
   labels = target_encoding.input_ids
 
   # replace padding token id's of the labels by -100
@@ -218,12 +219,12 @@ There's also [this blog post](https://huggingface.co/blog/encoder-decoder#encode
 generation works in general in encoder-decoder models.
 
 ```python
-from transformers import T5Tokenizer, T5ForConditionalGeneration 
+from transformers import T5Tokenizer, T5ForConditionalGeneration
 
 tokenizer = T5Tokenizer.from_pretrained("t5-small")
 model = T5ForConditionalGeneration.from_pretrained("t5-small")
 
-input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
 outputs = model.generate(input_ids)
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 # Das Haus ist wunderbar.
@@ -242,17 +243,17 @@ model = T5ForConditionalGeneration.from_pretrained("t5-small")
 
 # when generating, we will use the logits of right-most token to predict the next token
 # so the padding should be on the left
-tokenizer.padding_side = "left" 
-tokenizer.pad_token = tokenizer.eos_token # to avoid an error
+tokenizer.padding_side = "left"
+tokenizer.pad_token = tokenizer.eos_token  # to avoid an error
 
-task_prefix = 'translate English to German: '
-sentences = ['The house is wonderful.', 'I like to work in NYC.'] # use different length sentences to test batching
+task_prefix = "translate English to German: "
+sentences = ["The house is wonderful.", "I like to work in NYC."]  # use different length sentences to test batching
 inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)
 
 output_sequences = model.generate(
-    input_ids=inputs['input_ids'],
-    attention_mask=inputs['attention_mask'],
-    do_sample=False, # disable sampling to test if batching affects output
+    input_ids=inputs["input_ids"],
+    attention_mask=inputs["attention_mask"],
+    do_sample=False,  # disable sampling to test if batching affects output
 )
 
 print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))
diff --git a/docs/source/model_doc/t5v1.1.mdx b/docs/source/model_doc/t5v1.1.mdx
index 5c829213a6..512c5c59ce 100644
--- a/docs/source/model_doc/t5v1.1.mdx
+++ b/docs/source/model_doc/t5v1.1.mdx
@@ -22,7 +22,7 @@ One can directly plug in the weights of T5v1.1 into a T5 model, like so:
 ```python
 from transformers import T5ForConditionalGeneration
 
-model = T5ForConditionalGeneration.from_pretrained('google/t5-v1_1-base')
+model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-base")
 ```
 
 T5 Version 1.1 includes the following improvements compared to the original T5 model:
diff --git a/docs/source/model_doc/tapas.mdx b/docs/source/model_doc/tapas.mdx
index fe22d63659..f1be4ae9ed 100644
--- a/docs/source/model_doc/tapas.mdx
+++ b/docs/source/model_doc/tapas.mdx
@@ -75,28 +75,28 @@ dependency in case you're using Tensorflow:
 >>> from transformers import TapasConfig, TapasForQuestionAnswering
 
 >>> # for example, the base sized model with default SQA configuration
->>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base')
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base")
 
 >>> # or, the base sized model with WTQ configuration
->>> config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wtq')
->>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+>>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq")
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
 
 >>> # or, the base sized model with WikiSQL configuration
->>> config = TapasConfig('google-base-finetuned-wikisql-supervised')
->>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+>>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wikisql-supervised")
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
 ===PT-TF-SPLIT===
 >>> from transformers import TapasConfig, TFTapasForQuestionAnswering
 
 >>> # for example, the base sized model with default SQA configuration
->>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base')
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base")
 
 >>> # or, the base sized model with WTQ configuration
->>> config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wtq')
->>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+>>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq")
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
 
 >>> # or, the base sized model with WikiSQL configuration
->>> config = TapasConfig('google-base-finetuned-wikisql-supervised')
->>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+>>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wikisql-supervised")
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
 ```
 
 Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. You can also experiment by defining any hyperparameters you want when initializing [`TapasConfig`], and then create a [`TapasForQuestionAnswering`] based on that configuration. For example, if you have a dataset that has both conversational questions and questions that might involve aggregation, then you can do it this way. Here's an example:
@@ -107,14 +107,14 @@ Of course, you don't necessarily have to follow one of these three ways in which
 >>> # you can initialize the classification heads any way you want (see docs of TapasConfig)
 >>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True)
 >>> # initializing the pre-trained base sized model with our custom classification heads
->>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
 ===PT-TF-SPLIT===
 >>> from transformers import TapasConfig, TFTapasForQuestionAnswering
 
 >>> # you can initialize the classification heads any way you want (see docs of TapasConfig)
 >>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True)
 >>> # initializing the pre-trained base sized model with our custom classification heads
->>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
 ```
 
 What you can also do is start from an already fine-tuned checkpoint. A note here is that the already fine-tuned checkpoint on WTQ has some issues due to the L2-loss which is somewhat brittle. See [here](https://github.com/google-research/tapas/issues/91#issuecomment-735719340) for more info.
@@ -154,15 +154,26 @@ inputs to be fine-tuned:
 >>> from transformers import TapasTokenizer
 >>> import pandas as pd
 
->>> model_name = 'google/tapas-base'
+>>> model_name = "google/tapas-base"
 >>> tokenizer = TapasTokenizer.from_pretrained(model_name)
 
->>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
->>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> queries = [
+...     "What is the name of the first actor?",
+...     "How many movies has George Clooney played in?",
+...     "What is the total number of movies?",
+... ]
 >>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]]
 >>> answer_text = [["Brad Pitt"], ["69"], ["209"]]
 >>> table = pd.DataFrame.from_dict(data)
->>> inputs = tokenizer(table=table, queries=queries, answer_coordinates=answer_coordinates, answer_text=answer_text, padding='max_length', return_tensors='pt')
+>>> inputs = tokenizer(
+...     table=table,
+...     queries=queries,
+...     answer_coordinates=answer_coordinates,
+...     answer_text=answer_text,
+...     padding="max_length",
+...     return_tensors="pt",
+... )
 >>> inputs
 {'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]),
 'numeric_values': tensor([[ ... ]]), 'numeric_values_scale: tensor([[ ... ]]), labels: tensor([[ ... ]])}
@@ -170,15 +181,26 @@ inputs to be fine-tuned:
 >>> from transformers import TapasTokenizer
 >>> import pandas as pd
 
->>> model_name = 'google/tapas-base'
+>>> model_name = "google/tapas-base"
 >>> tokenizer = TapasTokenizer.from_pretrained(model_name)
 
->>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
->>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> queries = [
+...     "What is the name of the first actor?",
+...     "How many movies has George Clooney played in?",
+...     "What is the total number of movies?",
+... ]
 >>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]]
 >>> answer_text = [["Brad Pitt"], ["69"], ["209"]]
 >>> table = pd.DataFrame.from_dict(data)
->>> inputs = tokenizer(table=table, queries=queries, answer_coordinates=answer_coordinates, answer_text=answer_text, padding='max_length', return_tensors='tf')
+>>> inputs = tokenizer(
+...     table=table,
+...     queries=queries,
+...     answer_coordinates=answer_coordinates,
+...     answer_text=answer_text,
+...     padding="max_length",
+...     return_tensors="tf",
+... )
 >>> inputs
 {'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]),
 'numeric_values': tensor([[ ... ]]), 'numeric_values_scale: tensor([[ ... ]]), labels: tensor([[ ... ]])}
@@ -194,32 +216,37 @@ Of course, this only shows how to encode a single training example. It is advise
 >>> tsv_path = "your_path_to_the_tsv_file"
 >>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files"
 
+
 >>> class TableDataset(torch.utils.data.Dataset):
 ...     def __init__(self, data, tokenizer):
 ...         self.data = data
 ...         self.tokenizer = tokenizer
-...
+
 ...     def __getitem__(self, idx):
 ...         item = data.iloc[idx]
-...         table = pd.read_csv(table_csv_path + item.table_file).astype(str) # be sure to make your table data text only
-...         encoding = self.tokenizer(table=table, 
-...                                   queries=item.question, 
-...                                   answer_coordinates=item.answer_coordinates, 
-...                                   answer_text=item.answer_text,
-...                                   truncation=True,
-...                                   padding="max_length",
-...                                   return_tensors="pt"
+...         table = pd.read_csv(table_csv_path + item.table_file).astype(
+...             str
+...         )  # be sure to make your table data text only
+...         encoding = self.tokenizer(
+...             table=table,
+...             queries=item.question,
+...             answer_coordinates=item.answer_coordinates,
+...             answer_text=item.answer_text,
+...             truncation=True,
+...             padding="max_length",
+...             return_tensors="pt",
 ...         )
 ...         # remove the batch dimension which the tokenizer adds by default
 ...         encoding = {key: val.squeeze(0) for key, val in encoding.items()}
 ...         # add the float_answer which is also required (weak supervision for aggregation case)
-...         encoding["float_answer"] = torch.tensor(item.float_answer) 
+...         encoding["float_answer"] = torch.tensor(item.float_answer)
 ...         return encoding
-...
-...     def __len__(self):
-...        return len(self.data)
 
->>> data = pd.read_csv(tsv_path, sep='\t')
+...     def __len__(self):
+...         return len(self.data)
+
+
+>>> data = pd.read_csv(tsv_path, sep="\t")
 >>> train_dataset = TableDataset(data, tokenizer)
 >>> train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
 ===PT-TF-SPLIT===
@@ -229,44 +256,50 @@ Of course, this only shows how to encode a single training example. It is advise
 >>> tsv_path = "your_path_to_the_tsv_file"
 >>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files"
 
+
 >>> class TableDataset:
 ...     def __init__(self, data, tokenizer):
 ...         self.data = data
 ...         self.tokenizer = tokenizer
-...
+
 ...     def __iter__(self):
 ...         for idx in range(self.__len__()):
 ...             item = self.data.iloc[idx]
-...             table = pd.read_csv(table_csv_path + item.table_file).astype(str) # be sure to make your table data text only
-...             encoding = self.tokenizer(table=table, 
-...                                   queries=item.question, 
-...                                   answer_coordinates=item.answer_coordinates, 
-...                                   answer_text=item.answer_text,
-...                                   truncation=True,
-...                                   padding="max_length",
-...                                   return_tensors="tf"
+...             table = pd.read_csv(table_csv_path + item.table_file).astype(
+...                 str
+...             )  # be sure to make your table data text only
+...             encoding = self.tokenizer(
+...                 table=table,
+...                 queries=item.question,
+...                 answer_coordinates=item.answer_coordinates,
+...                 answer_text=item.answer_text,
+...                 truncation=True,
+...                 padding="max_length",
+...                 return_tensors="tf",
 ...             )
 ...             # remove the batch dimension which the tokenizer adds by default
-...             encoding = {key: tf.squeeze(val,0) for key, val in encoding.items()}
+...             encoding = {key: tf.squeeze(val, 0) for key, val in encoding.items()}
 ...             # add the float_answer which is also required (weak supervision for aggregation case)
-...             encoding["float_answer"] = tf.convert_to_tensor(item.float_answer,dtype=tf.float32)
-...             yield encoding['input_ids'], encoding['attention_mask'], encoding['numeric_values'], \
-...                   encoding['numeric_values_scale'], encoding['token_type_ids'], encoding['labels'], \
-...                   encoding['float_answer']
-...
-...     def __len__(self):
-...        return len(self.data)
+...             encoding["float_answer"] = tf.convert_to_tensor(item.float_answer, dtype=tf.float32)
+...             yield encoding["input_ids"], encoding["attention_mask"], encoding["numeric_values"], encoding[
+...                 "numeric_values_scale"
+...             ], encoding["token_type_ids"], encoding["labels"], encoding["float_answer"]
 
->>> data = pd.read_csv(tsv_path, sep='\t')
+...     def __len__(self):
+...         return len(self.data)
+
+
+>>> data = pd.read_csv(tsv_path, sep="\t")
 >>> train_dataset = TableDataset(data, tokenizer)
 >>> output_signature = (
-... tf.TensorSpec(shape=(512,), dtype=tf.int32),
-... tf.TensorSpec(shape=(512,), dtype=tf.int32),
-... tf.TensorSpec(shape=(512,), dtype=tf.float32),
-... tf.TensorSpec(shape=(512,), dtype=tf.float32),
-... tf.TensorSpec(shape=(512,7), dtype=tf.int32),
-... tf.TensorSpec(shape=(512,), dtype=tf.int32),
-... tf.TensorSpec(shape=(512,), dtype=tf.float32))
+...     tf.TensorSpec(shape=(512,), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.float32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.float32),
+...     tf.TensorSpec(shape=(512, 7), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.float32),
+... )
 >>> train_dataloader = tf.data.Dataset.from_generator(train_dataset, output_signature=output_signature).batch(32)
 ```
 
@@ -282,15 +315,15 @@ You can then fine-tune [`TapasForQuestionAnswering`] or [`TFTapasForQuestionAnsw
 
 >>> # this is the default WTQ configuration
 >>> config = TapasConfig(
-...            num_aggregation_labels = 4,
-...            use_answer_as_supervision = True,
-...            answer_loss_cutoff = 0.664694,
-...            cell_selection_preference = 0.207951,
-...            huber_loss_delta = 0.121194,
-...            init_cell_selection_weights_to_zero = True,
-...            select_one_column = True,
-...            allow_empty_column_selection = False,
-...            temperature = 0.0352513,
+...     num_aggregation_labels=4,
+...     use_answer_as_supervision=True,
+...     answer_loss_cutoff=0.664694,
+...     cell_selection_preference=0.207951,
+...     huber_loss_delta=0.121194,
+...     init_cell_selection_weights_to_zero=True,
+...     select_one_column=True,
+...     allow_empty_column_selection=False,
+...     temperature=0.0352513,
 ... )
 >>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
 
@@ -298,8 +331,8 @@ You can then fine-tune [`TapasForQuestionAnswering`] or [`TFTapasForQuestionAnsw
 
 >>> model.train()
 >>> for epoch in range(2):  # loop over the dataset multiple times
-...    for batch in train_dataloader:
-...         # get the inputs; 
+...     for batch in train_dataloader:
+...         # get the inputs;
 ...         input_ids = batch["input_ids"]
 ...         attention_mask = batch["attention_mask"]
 ...         token_type_ids = batch["token_type_ids"]
@@ -312,9 +345,15 @@ You can then fine-tune [`TapasForQuestionAnswering`] or [`TFTapasForQuestionAnsw
 ...         optimizer.zero_grad()
 
 ...         # forward + backward + optimize
-...         outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, 
-...                        labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale, 
-...                        float_answer=float_answer)
+...         outputs = model(
+...             input_ids=input_ids,
+...             attention_mask=attention_mask,
+...             token_type_ids=token_type_ids,
+...             labels=labels,
+...             numeric_values=numeric_values,
+...             numeric_values_scale=numeric_values_scale,
+...             float_answer=float_answer,
+...         )
 ...         loss = outputs.loss
 ...         loss.backward()
 ...         optimizer.step()
@@ -324,23 +363,23 @@ You can then fine-tune [`TapasForQuestionAnswering`] or [`TFTapasForQuestionAnsw
 
 >>> # this is the default WTQ configuration
 >>> config = TapasConfig(
-...            num_aggregation_labels = 4,
-...            use_answer_as_supervision = True,
-...            answer_loss_cutoff = 0.664694,
-...            cell_selection_preference = 0.207951,
-...            huber_loss_delta = 0.121194,
-...            init_cell_selection_weights_to_zero = True,
-...            select_one_column = True,
-...            allow_empty_column_selection = False,
-...            temperature = 0.0352513,
+...     num_aggregation_labels=4,
+...     use_answer_as_supervision=True,
+...     answer_loss_cutoff=0.664694,
+...     cell_selection_preference=0.207951,
+...     huber_loss_delta=0.121194,
+...     init_cell_selection_weights_to_zero=True,
+...     select_one_column=True,
+...     allow_empty_column_selection=False,
+...     temperature=0.0352513,
 ... )
 >>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
 
 >>> optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
 
 >>> for epoch in range(2):  # loop over the dataset multiple times
-...    for batch in train_dataloader:
-...         # get the inputs; 
+...     for batch in train_dataloader:
+...         # get the inputs;
 ...         input_ids = batch[0]
 ...         attention_mask = batch[1]
 ...         token_type_ids = batch[4]
@@ -351,9 +390,15 @@ You can then fine-tune [`TapasForQuestionAnswering`] or [`TFTapasForQuestionAnsw
 
 ...         # forward + backward + optimize
 ...         with tf.GradientTape() as tape:
-...              outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, 
-...                        labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale, 
-...                        float_answer=float_answer )
+...             outputs = model(
+...                 input_ids=input_ids,
+...                 attention_mask=attention_mask,
+...                 token_type_ids=token_type_ids,
+...                 labels=labels,
+...                 numeric_values=numeric_values,
+...                 numeric_values_scale=numeric_values_scale,
+...                 float_answer=float_answer,
+...             )
 ...         grads = tape.gradient(outputs.loss, model.trainable_weights)
 ...         optimizer.apply_gradients(zip(grads, model.trainable_weights))
 ```
@@ -366,47 +411,49 @@ However, note that inference is **different** depending on whether or not the se
 
 ```py
 >>> from transformers import TapasTokenizer, TapasForQuestionAnswering
->>> import pandas as pd 
+>>> import pandas as pd
 
->>> model_name = 'google/tapas-base-finetuned-wtq'
+>>> model_name = "google/tapas-base-finetuned-wtq"
 >>> model = TapasForQuestionAnswering.from_pretrained(model_name)
 >>> tokenizer = TapasTokenizer.from_pretrained(model_name)
 
->>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
->>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> queries = [
+...     "What is the name of the first actor?",
+...     "How many movies has George Clooney played in?",
+...     "What is the total number of movies?",
+... ]
 >>> table = pd.DataFrame.from_dict(data)
->>> inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="pt") 
+>>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt")
 >>> outputs = model(**inputs)
 >>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
-...         inputs, 
-...         outputs.logits.detach(), 
-...         outputs.logits_aggregation.detach()
+...     inputs, outputs.logits.detach(), outputs.logits_aggregation.detach()
 ... )
 
 >>> # let's print out the results:
->>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3:"COUNT"}
+>>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
 >>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]
 
 >>> answers = []
 >>> for coordinates in predicted_answer_coordinates:
-...   if len(coordinates) == 1:
-...     # only a single cell:
-...     answers.append(table.iat[coordinates[0]])
-...   else:
-...     # multiple cells
-...     cell_values = []
-...     for coordinate in coordinates:
-...        cell_values.append(table.iat[coordinate])
-...     answers.append(", ".join(cell_values))
+...     if len(coordinates) == 1:
+...         # only a single cell:
+...         answers.append(table.iat[coordinates[0]])
+...     else:
+...         # multiple cells
+...         cell_values = []
+...         for coordinate in coordinates:
+...             cell_values.append(table.iat[coordinate])
+...         answers.append(", ".join(cell_values))
 
 >>> display(table)
 >>> print("")
 >>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
-...   print(query)
-...   if predicted_agg == "NONE":
-...     print("Predicted answer: " + answer)
-...   else:
-...     print("Predicted answer: " + predicted_agg + " > " + answer)    
+...     print(query)
+...     if predicted_agg == "NONE":
+...         print("Predicted answer: " + answer)
+...     else:
+...         print("Predicted answer: " + predicted_agg + " > " + answer)
 What is the name of the first actor?
 Predicted answer: Brad Pitt
 How many movies has George Clooney played in?
@@ -415,47 +462,49 @@ What is the total number of movies?
 Predicted answer: SUM > 87, 53, 69
 ===PT-TF-SPLIT===
 >>> from transformers import TapasTokenizer, TFTapasForQuestionAnswering
->>> import pandas as pd 
+>>> import pandas as pd
 
->>> model_name = 'google/tapas-base-finetuned-wtq'
+>>> model_name = "google/tapas-base-finetuned-wtq"
 >>> model = TFTapasForQuestionAnswering.from_pretrained(model_name)
 >>> tokenizer = TapasTokenizer.from_pretrained(model_name)
 
->>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
->>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> queries = [
+...     "What is the name of the first actor?",
+...     "How many movies has George Clooney played in?",
+...     "What is the total number of movies?",
+... ]
 >>> table = pd.DataFrame.from_dict(data)
->>> inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="tf") 
+>>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf")
 >>> outputs = model(**inputs)
 >>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
-...         inputs, 
-...         outputs.logits, 
-...         outputs.logits_aggregation
+...     inputs, outputs.logits, outputs.logits_aggregation
 ... )
 
 >>> # let's print out the results:
->>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3:"COUNT"}
+>>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
 >>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]
 
 >>> answers = []
 >>> for coordinates in predicted_answer_coordinates:
-...   if len(coordinates) == 1:
-...     # only a single cell:
-...     answers.append(table.iat[coordinates[0]])
-...   else:
-...     # multiple cells
-...     cell_values = []
-...     for coordinate in coordinates:
-...        cell_values.append(table.iat[coordinate])
-...     answers.append(", ".join(cell_values))
+...     if len(coordinates) == 1:
+...         # only a single cell:
+...         answers.append(table.iat[coordinates[0]])
+...     else:
+...         # multiple cells
+...         cell_values = []
+...         for coordinate in coordinates:
+...             cell_values.append(table.iat[coordinate])
+...         answers.append(", ".join(cell_values))
 
 >>> display(table)
 >>> print("")
 >>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
-...   print(query)
-...   if predicted_agg == "NONE":
-...     print("Predicted answer: " + answer)
-...   else:
-...     print("Predicted answer: " + predicted_agg + " > " + answer)    
+...     print(query)
+...     if predicted_agg == "NONE":
+...         print("Predicted answer: " + answer)
+...     else:
+...         print("Predicted answer: " + predicted_agg + " > " + answer)
 What is the name of the first actor?
 Predicted answer: Brad Pitt
 How many movies has George Clooney played in?
diff --git a/docs/source/model_doc/visual_bert.mdx b/docs/source/model_doc/visual_bert.mdx
index f8ec0714a7..69507076b0 100644
--- a/docs/source/model_doc/visual_bert.mdx
+++ b/docs/source/model_doc/visual_bert.mdx
@@ -77,11 +77,13 @@ The following example shows how to get the last hidden state using [`VisualBertM
 
 >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
->>> inputs.update({
-...     "visual_embeds": visual_embeds,
-...     "visual_token_type_ids": visual_token_type_ids,
-...     "visual_attention_mask": visual_attention_mask
-... })
+>>> inputs.update(
+...     {
+...         "visual_embeds": visual_embeds,
+...         "visual_token_type_ids": visual_token_type_ids,
+...         "visual_attention_mask": visual_attention_mask,
+...     }
+... )
 >>> outputs = model(**inputs)
 >>> last_hidden_state = outputs.last_hidden_state
 ```
diff --git a/docs/source/model_sharing.mdx b/docs/source/model_sharing.mdx
index d7f3e7be80..a94988611c 100644
--- a/docs/source/model_sharing.mdx
+++ b/docs/source/model_sharing.mdx
@@ -50,9 +50,8 @@ For instance:
 
 ```python
 >>> model = AutoModel.from_pretrained(
->>>     "julien-c/EsperBERTo-small",
->>>     revision="v2.0.1" # tag name, or branch name, or commit hash
->>> )
+...     "julien-c/EsperBERTo-small", revision="v2.0.1"  # tag name, or branch name, or commit hash
+... )
 ```
 
 ## Push your model from Python
@@ -344,9 +343,8 @@ You may specify a revision by using the `revision` flag in the `from_pretrained`
 
 ```python
 >>> tokenizer = AutoTokenizer.from_pretrained(
->>>   "julien-c/EsperBERTo-small",
->>>   revision="v2.0.1" # tag name, or branch name, or commit hash
->>> )
+...     "julien-c/EsperBERTo-small", revision="v2.0.1"  # tag name, or branch name, or commit hash
+... )
 ```
 
 ## Workflow in a Colab notebook
diff --git a/docs/source/multilingual.mdx b/docs/source/multilingual.mdx
index 49b366b828..3b6df686d7 100644
--- a/docs/source/multilingual.mdx
+++ b/docs/source/multilingual.mdx
@@ -62,18 +62,18 @@ The different languages this model/tokenizer handles, as well as the ids of thes
 These ids should be used when passing a language parameter during a model pass. Let's define our inputs:
 
 ```py
->>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
+>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])  # batch size of 1
 ```
 
 We should now define the language embedding by using the previously defined language id. We want to create a tensor
 filled with the appropriate language ids, of the same size as input_ids. For english, the id is 0:
 
 ```py
->>> language_id = tokenizer.lang2id['en']  # 0
+>>> language_id = tokenizer.lang2id["en"]  # 0
 >>> langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
 
 >>> # We reshape it to be of size (batch_size, sequence_length)
->>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
+>>> langs = langs.view(1, -1)  # is now of shape [1, sequence_length] (we have a batch size of 1)
 ```
 
 You can then feed it all as input to your model:
diff --git a/docs/source/perplexity.mdx b/docs/source/perplexity.mdx
index f53b565037..3706a40091 100644
--- a/docs/source/perplexity.mdx
+++ b/docs/source/perplexity.mdx
@@ -69,8 +69,9 @@ Let's demonstrate this process with GPT-2.
 
 ```python
 from transformers import GPT2LMHeadModel, GPT2TokenizerFast
-device = 'cuda'
-model_id = 'gpt2-large'
+
+device = "cuda"
+model_id = "gpt2-large"
 model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
 tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
 ```
@@ -81,8 +82,9 @@ dataset in memory.
 
 ```python
 from datasets import load_dataset
-test = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
-encodings = tokenizer('\n\n'.join(test['text']), return_tensors='pt')
+
+test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")
 ```
 
 With 🤗 Transformers, we can simply pass the `input_ids` as the `labels` to our model, and the average negative
@@ -104,10 +106,10 @@ nlls = []
 for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
     begin_loc = max(i + stride - max_length, 0)
     end_loc = min(i + stride, encodings.input_ids.size(1))
-    trg_len = end_loc - i    # may be different from stride on last loop
-    input_ids = encodings.input_ids[:,begin_loc:end_loc].to(device)
+    trg_len = end_loc - i  # may be different from stride on last loop
+    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
     target_ids = input_ids.clone()
-    target_ids[:,:-trg_len] = -100
+    target_ids[:, :-trg_len] = -100
 
     with torch.no_grad():
         outputs = model(input_ids, labels=target_ids)
diff --git a/docs/source/preprocessing.mdx b/docs/source/preprocessing.mdx
index b53bb00731..331d1566ed 100644
--- a/docs/source/preprocessing.mdx
+++ b/docs/source/preprocessing.mdx
@@ -36,7 +36,8 @@ To automatically download the vocab used during pretraining or fine-tuning a giv
 
 ```py
 from transformers import AutoTokenizer
-tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 ```
 
 ## Base use
@@ -75,9 +76,7 @@ If you have several sentences you want to process, you can do this efficiently b
 tokenizer:
 
 ```py
->>> batch_sentences = ["Hello I'm a single sentence",
-...                    "And another sentence",
-...                    "And the very very last one"]
+>>> batch_sentences = ["Hello I'm a single sentence", "And another sentence", "And the very very last one"]
 >>> encoded_inputs = tokenizer(batch_sentences)
 >>> print(encoded_inputs)
 {'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102],
@@ -174,12 +173,12 @@ If you have a list of pairs of sequences you want to process, you should feed th
 list of first sentences and the list of second sentences:
 
 ```py
->>> batch_sentences = ["Hello I'm a single sentence",
-...                    "And another sentence",
-...                    "And the very very last one"]
->>> batch_of_second_sentences = ["I'm a sentence that goes with the first sentence",
-...                              "And I should be encoded with the second sentence",
-...                              "And I go with the very last one"]
+>>> batch_sentences = ["Hello I'm a single sentence", "And another sentence", "And the very very last one"]
+>>> batch_of_second_sentences = [
+...     "I'm a sentence that goes with the first sentence",
+...     "And I should be encoded with the second sentence",
+...     "And I go with the very last one",
+... ]
 >>> encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences)
 >>> print(encoded_inputs)
 {'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102, 146, 112, 182, 170, 5650, 1115, 2947, 1114, 1103, 1148, 5650, 102], 
@@ -199,7 +198,7 @@ To double-check what is fed to the model, we can decode each list in _input_ids_
 
 ```py
 >>> for ids in encoded_inputs["input_ids"]:
->>>     print(tokenizer.decode(ids))
+...     print(tokenizer.decode(ids))
 [CLS] Hello I'm a single sentence [SEP] I'm a sentence that goes with the first sentence [SEP]
 [CLS] And another sentence [SEP] And I should be encoded with the second sentence [SEP]
 [CLS] And the very very last one [SEP] And I go with the very last one [SEP]
@@ -307,35 +306,43 @@ This works exactly as before for batch of sentences or batch of pairs of sentenc
 like this:
 
 ```py
-batch_sentences = [["Hello", "I'm", "a", "single", "sentence"],
-                   ["And", "another", "sentence"],
-                   ["And", "the", "very", "very", "last", "one"]]
+batch_sentences = [
+    ["Hello", "I'm", "a", "single", "sentence"],
+    ["And", "another", "sentence"],
+    ["And", "the", "very", "very", "last", "one"],
+]
 encoded_inputs = tokenizer(batch_sentences, is_split_into_words=True)
 ```
 
 or a batch of pair sentences like this:
 
 ```py
-batch_of_second_sentences = [["I'm", "a", "sentence", "that", "goes", "with", "the", "first", "sentence"],
-                             ["And", "I", "should", "be", "encoded", "with", "the", "second", "sentence"],
-                             ["And", "I", "go", "with", "the", "very", "last", "one"]]
+batch_of_second_sentences = [
+    ["I'm", "a", "sentence", "that", "goes", "with", "the", "first", "sentence"],
+    ["And", "I", "should", "be", "encoded", "with", "the", "second", "sentence"],
+    ["And", "I", "go", "with", "the", "very", "last", "one"],
+]
 encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences, is_split_into_words=True)
 ```
 
 And you can add padding, truncation as well as directly return tensors like before:
 
 ```py
-batch = tokenizer(batch_sentences,
-                  batch_of_second_sentences,
-                  is_split_into_words=True,
-                  padding=True,
-                  truncation=True,
-                  return_tensors="pt")
+batch = tokenizer(
+    batch_sentences,
+    batch_of_second_sentences,
+    is_split_into_words=True,
+    padding=True,
+    truncation=True,
+    return_tensors="pt",
+)
 ===PT-TF-SPLIT===
-batch = tokenizer(batch_sentences,
-                  batch_of_second_sentences,
-                  is_split_into_words=True,
-                  padding=True,
-                  truncation=True,
-                  return_tensors="tf")
+batch = tokenizer(
+    batch_sentences,
+    batch_of_second_sentences,
+    is_split_into_words=True,
+    padding=True,
+    truncation=True,
+    return_tensors="tf",
+)
 ```
diff --git a/docs/source/quicktour.mdx b/docs/source/quicktour.mdx
index 90ba300e54..38a65cb528 100644
--- a/docs/source/quicktour.mdx
+++ b/docs/source/quicktour.mdx
@@ -57,7 +57,8 @@ pip install tensorflow
 
 ```py
 >>> from transformers import pipeline
->>> classifier = pipeline('sentiment-analysis')
+
+>>> classifier = pipeline("sentiment-analysis")
 ```
 
 When typing this command for the first time, a pretrained model and its tokenizer are downloaded and cached. We will
@@ -67,7 +68,7 @@ make them readable. For instance:
 
 
 ```py
->>> classifier('We are very happy to show you the 🤗 Transformers library.')
+>>> classifier("We are very happy to show you the 🤗 Transformers library.")
 [{'label': 'POSITIVE', 'score': 0.9998}]
 ```
 
@@ -75,8 +76,7 @@ That's encouraging! You can use it on a list of sentences, which will be preproc
 a list of dictionaries like this one:
 
 ```py
->>> results = classifier(["We are very happy to show you the 🤗 Transformers library.",
-...            "We hope you don't hate it."])
+>>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."])
 >>> for result in results:
 ...     print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
 label: POSITIVE, with score: 0.9998
@@ -102,7 +102,7 @@ see how we can use it.
 You can directly pass the name of the model to use to [`pipeline`]:
 
 ```py
->>> classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")
+>>> classifier = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")
 ```
 
 This classifier can now deal with texts in English, French, but also Dutch, German, Italian and Spanish! You can also
@@ -125,13 +125,13 @@ any other model from the model hub):
 >>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
 >>> model = AutoModelForSequenceClassification.from_pretrained(model_name)
 >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
->>> classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
 ===PT-TF-SPLIT===
 >>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
 >>> # This model only exists in PyTorch, so we use the _from_pt_ flag to import that model in TensorFlow.
 >>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True)
 >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
->>> classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
 ```
 
 If you don't find a model that has been pretrained on some data similar to yours, you will need to fine-tune a
@@ -150,11 +150,13 @@ As we saw, the model and tokenizer are created using the `from_pretrained` metho
 
 ```py
 >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
 >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
 >>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
 >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
 ===PT-TF-SPLIT===
 >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+
 >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
 >>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
 >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -199,7 +201,7 @@ and get tensors back. You can specify all of that to the tokenizer:
 ...     padding=True,
 ...     truncation=True,
 ...     max_length=512,
-...     return_tensors="pt"
+...     return_tensors="pt",
 ... )
 ===PT-TF-SPLIT===
 >>> tf_batch = tokenizer(
@@ -207,7 +209,7 @@ and get tensors back. You can specify all of that to the tokenizer:
 ...     padding=True,
 ...     truncation=True,
 ...     max_length=512,
-...     return_tensors="tf"
+...     return_tensors="tf",
 ... )
 ```
 
@@ -267,9 +269,11 @@ Let's apply the SoftMax activation to get predictions.
 
 ```py
 >>> from torch import nn
+
 >>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
 ===PT-TF-SPLIT===
 >>> import tensorflow as tf
+
 >>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)
 ```
 
@@ -291,13 +295,15 @@ attribute:
 
 ```py
 >>> import torch
->>> pt_outputs = pt_model(**pt_batch, labels = torch.tensor([1, 0]))
+
+>>> pt_outputs = pt_model(**pt_batch, labels=torch.tensor([1, 0]))
 >>> print(pt_outputs)
 SequenceClassifierOutput(loss=tensor(0.3167, grad_fn=<NllLossBackward>), logits=tensor([[-4.0833,  4.3364],
         [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
 ===PT-TF-SPLIT===
 >>> import tensorflow as tf
->>> tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0]))
+
+>>> tf_outputs = tf_model(tf_batch, labels=tf.constant([1, 0]))
 >>> print(tf_outputs)
 TFSequenceClassifierOutput(loss=<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2.2051e-04, 6.3326e-01], dtype=float32)>, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
 array([[-4.0833 ,  4.3364  ],
@@ -317,11 +323,11 @@ case the attributes not set (that have `None` values) are ignored.
 Once your model is fine-tuned, you can save it with its tokenizer in the following way:
 
 ```py
->>> pt_save_directory = './pt_save_pretrained'
+>>> pt_save_directory = "./pt_save_pretrained"
 >>> tokenizer.save_pretrained(pt_save_directory)
 >>> pt_model.save_pretrained(pt_save_directory)
 ===PT-TF-SPLIT===
->>> tf_save_directory = './tf_save_pretrained'
+>>> tf_save_directory = "./tf_save_pretrained"
 >>> tokenizer.save_pretrained(tf_save_directory)
 >>> tf_model.save_pretrained(tf_save_directory)
 ```
@@ -343,10 +349,12 @@ Then, use the corresponding Auto class to load it like this:
 
 ```py
 >>> from transformers import AutoModel
+
 >>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
 >>> pt_model = AutoModel.from_pretrained(tf_save_directory, from_tf=True)
 ===PT-TF-SPLIT===
 >>> from transformers import TFAutoModel
+
 >>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
 >>> tf_model = TFAutoModel.from_pretrained(pt_save_directory, from_pt=True)
 ```
@@ -356,11 +364,11 @@ Lastly, you can also ask the model to return all hidden states and all attention
 
 ```py
 >>> pt_outputs = pt_model(**pt_batch, output_hidden_states=True, output_attentions=True)
->>> all_hidden_states  = pt_outputs.hidden_states 
+>>> all_hidden_states = pt_outputs.hidden_states
 >>> all_attentions = pt_outputs.attentions
 ===PT-TF-SPLIT===
 >>> tf_outputs = tf_model(tf_batch, output_hidden_states=True, output_attentions=True)
->>> all_hidden_states =  tf_outputs.hidden_states
+>>> all_hidden_states = tf_outputs.hidden_states
 >>> all_attentions = tf_outputs.attentions
 ```
 
@@ -376,11 +384,13 @@ directly instantiate model and tokenizer without the auto magic:
 
 ```py
 >>> from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
+
 >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
 >>> model = DistilBertForSequenceClassification.from_pretrained(model_name)
 >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
 ===PT-TF-SPLIT===
 >>> from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
+
 >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
 >>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name)
 >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
@@ -401,13 +411,15 @@ the model from scratch. Therefore, we instantiate the model from a configuration
 
 ```py
 >>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
->>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
->>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+
+>>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4 * 512)
+>>> tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
 >>> model = DistilBertForSequenceClassification(config)
 ===PT-TF-SPLIT===
 >>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
->>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
->>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+
+>>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4 * 512)
+>>> tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
 >>> model = TFDistilBertForSequenceClassification(config)
 ```
 
@@ -419,11 +431,13 @@ configuration appropriately:
 
 ```py
 >>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
+
 >>> model_name = "distilbert-base-uncased"
 >>> model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
 >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
 ===PT-TF-SPLIT===
 >>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
+
 >>> model_name = "distilbert-base-uncased"
 >>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
 >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
diff --git a/docs/source/serialization.mdx b/docs/source/serialization.mdx
index 66d0933b04..0d667fc070 100644
--- a/docs/source/serialization.mdx
+++ b/docs/source/serialization.mdx
@@ -109,6 +109,7 @@ This export can now be used in the ONNX inference runtime:
 import onnxruntime as ort
 
 from transformers import BertTokenizerFast
+
 tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
 
 ort_session = ort.InferenceSession("onnx/bert-base-cased/model.onnx")
@@ -382,7 +383,7 @@ tokenized_text = enc.tokenize(text)
 
 # Masking one of the input tokens
 masked_index = 8
-tokenized_text[masked_index] = '[MASK]'
+tokenized_text[masked_index] = "[MASK]"
 indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
 segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
 
@@ -393,8 +394,14 @@ dummy_input = [tokens_tensor, segments_tensors]
 
 # Initializing the model with the torchscript flag
 # Flag set to True even though it is not necessary as this model does not have an LM Head.
-config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, torchscript=True)
+config = BertConfig(
+    vocab_size_or_config_json_file=32000,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    torchscript=True,
+)
 
 # Instantiating the model
 model = BertModel(config)
diff --git a/docs/source/task_summary.mdx b/docs/source/task_summary.mdx
index 02b0f314ba..a2bfb379bf 100644
--- a/docs/source/task_summary.mdx
+++ b/docs/source/task_summary.mdx
@@ -188,11 +188,15 @@ positions of the extracted answer in the text.
 
 ```py
 >>> result = question_answerer(question="What is extractive question answering?", context=context)
->>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
+>>> print(
+...     f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}"
+... )
 Answer: 'the task of extracting an answer from a text given a question', score: 0.6177, start: 34, end: 95
 
 >>> result = question_answerer(question="What is a good example of a question answering dataset?", context=context)
->>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
+>>> print(
+...     f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}"
+... )
 Answer: 'SQuAD dataset', score: 0.5152, start: 147, end: 160
 ```
 
@@ -232,18 +236,20 @@ Here is an example of question answering using a model and a tokenizer. The proc
 >>> for question in questions:
 ...     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
 ...     input_ids = inputs["input_ids"].tolist()[0]
-...
+
 ...     outputs = model(**inputs)
 ...     answer_start_scores = outputs.start_logits
 ...     answer_end_scores = outputs.end_logits
-...
+
 ...     # Get the most likely beginning of answer with the argmax of the score
 ...     answer_start = torch.argmax(answer_start_scores)
-...     # Get the most likely end of answer with the argmax of the score 
+...     # Get the most likely end of answer with the argmax of the score
 ...     answer_end = torch.argmax(answer_end_scores) + 1
-...
-...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
-...
+
+...     answer = tokenizer.convert_tokens_to_string(
+...         tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
+...     )
+
 ...     print(f"Question: {question}")
 ...     print(f"Answer: {answer}")
 Question: How many pretrained models are available in 🤗 Transformers?
@@ -275,18 +281,20 @@ Answer: tensorflow 2. 0 and pytorch
 >>> for question in questions:
 ...     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf")
 ...     input_ids = inputs["input_ids"].numpy()[0]
-...
+
 ...     outputs = model(inputs)
 ...     answer_start_scores = outputs.start_logits
 ...     answer_end_scores = outputs.end_logits
-...
+
 ...     # Get the most likely beginning of answer with the argmax of the score
 ...     answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
 ...     # Get the most likely end of answer with the argmax of the score
 ...     answer_end = tf.argmax(answer_end_scores, axis=1).numpy()[0] + 1
-...
-...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
-...
+
+...     answer = tokenizer.convert_tokens_to_string(
+...         tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end])
+...     )
+
 ...     print(f"Question: {question}")
 ...     print(f"Answer: {answer}")
 Question: How many pretrained models are available in 🤗 Transformers?
@@ -327,7 +335,12 @@ This outputs the sequences with the mask filled, the confidence score, and the t
 
 ```py
 >>> from pprint import pprint
->>> pprint(unmasker(f"HuggingFace is creating a {unmasker.tokenizer.mask_token} that the community uses to solve NLP tasks."))
+
+>>> pprint(
+...     unmasker(
+...         f"HuggingFace is creating a {unmasker.tokenizer.mask_token} that the community uses to solve NLP tasks."
+...     )
+... )
 [{'score': 0.1793,
   'sequence': 'HuggingFace is creating a tool that the community uses to solve '
               'NLP tasks.',
@@ -374,8 +387,10 @@ Here is an example of doing masked language modeling using a model and a tokeniz
 >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
 >>> model = AutoModelForMaskedLM.from_pretrained("distilbert-base-cased")
 
->>> sequence = "Distilled models are smaller than the models they mimic. Using them instead of the large " \
+>>> sequence = (
+...     "Distilled models are smaller than the models they mimic. Using them instead of the large "
 ...     f"versions would help {tokenizer.mask_token} our carbon footprint."
+... )
 
 >>> inputs = tokenizer(sequence, return_tensors="pt")
 >>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
@@ -399,8 +414,10 @@ Distilled models are smaller than the models they mimic. Using them instead of t
 >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
 >>> model = TFAutoModelForMaskedLM.from_pretrained("distilbert-base-cased")
 
->>> sequence = "Distilled models are smaller than the models they mimic. Using them instead of the large " \
+>>> sequence = (
+...     "Distilled models are smaller than the models they mimic. Using them instead of the large "
 ...     f"versions would help {tokenizer.mask_token} our carbon footprint."
+... )
 
 >>> inputs = tokenizer(sequence, return_tensors="tf")
 >>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
@@ -544,7 +561,7 @@ Below is an example of text generation using `XLNet` and its tokenizer, which in
 
 >>> prompt_length = len(tokenizer.decode(inputs[0]))
 >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
->>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:]
+>>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :]
 
 >>> print(generated)
 Today the weather is really nice and I am planning ...
@@ -571,7 +588,7 @@ Today the weather is really nice and I am planning ...
 
 >>> prompt_length = len(tokenizer.decode(inputs[0]))
 >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
->>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:]
+>>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length + 1 :]
 
 >>> print(generated)
 Today the weather is really nice and I am planning ...
@@ -660,8 +677,10 @@ Here is an example of doing named entity recognition, using a model and a tokeni
 >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
 >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 
->>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " \
-...            "therefore very close to the Manhattan Bridge."
+>>> sequence = (
+...     "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, "
+...     "therefore very close to the Manhattan Bridge."
+... )
 
 >>> inputs = tokenizer(sequence, return_tensors="pt")
 >>> tokens = inputs.tokens()
@@ -675,8 +694,10 @@ Here is an example of doing named entity recognition, using a model and a tokeni
 >>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
 >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
 
->>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " \
-...            "therefore very close to the Manhattan Bridge."
+>>> sequence = (
+...     "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, "
+...     "therefore very close to the Manhattan Bridge."
+... )
 
 >>> inputs = tokenizer(sequence, return_tensors="tf")
 >>> tokens = inputs.tokens()
@@ -863,7 +884,7 @@ Here is an example of doing translation using a model and a tokenizer. The proce
 
 >>> inputs = tokenizer(
 ...     "translate English to German: Hugging Face is a technology company based in New York and Paris",
-...     return_tensors="pt"
+...     return_tensors="pt",
 ... )
 >>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True)
 
@@ -877,7 +898,7 @@ Here is an example of doing translation using a model and a tokenizer. The proce
 
 >>> inputs = tokenizer(
 ...     "translate English to German: Hugging Face is a technology company based in New York and Paris",
-...     return_tensors="tf"
+...     return_tensors="tf",
 ... )
 >>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True)
 
diff --git a/docs/source/testing.mdx b/docs/source/testing.mdx
index 65a9881939..7e908f156d 100644
--- a/docs/source/testing.mdx
+++ b/docs/source/testing.mdx
@@ -422,14 +422,14 @@ Let's depict the GPU requirements in the following table:
 
 For example, here is a test that must be run only when there are 2 or more GPUs available and pytorch is installed:
 
-```python
+```python no-style
 @require_torch_multi_gpu
 def test_example_with_multi_gpu():
 ```
 
 If a test requires `tensorflow` use the `require_tf` decorator. For example:
 
-```python
+```python no-style
 @require_tf
 def test_tf_thing_with_tensorflow():
 ```
@@ -437,7 +437,7 @@ def test_tf_thing_with_tensorflow():
 These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is
 how to set it up:
 
-```python
+```python no-style
 @require_torch_gpu
 @slow
 def test_example_slow_on_gpu():
@@ -446,7 +446,7 @@ def test_example_slow_on_gpu():
 Some decorators like `@parametrized` rewrite test names, therefore `@require_*` skip decorators have to be listed
 last for them to work correctly. Here is an example of the correct usage:
 
-```python
+```python no-style
 @parameterized.expand(...)
 @require_torch_multi_gpu
 def test_integration_foo():
@@ -461,7 +461,8 @@ Inside tests:
 
 ```python
 from transformers.testing_utils import get_gpu_count
-n_gpu = get_gpu_count() # works with torch and tf
+
+n_gpu = get_gpu_count()  # works with torch and tf
 ```
 
 ### Distributed training
@@ -544,12 +545,16 @@ the test, but then there is no way of running that test for just one set of argu
 # test_this1.py
 import unittest
 from parameterized import parameterized
+
+
 class TestMathUnitTest(unittest.TestCase):
-    @parameterized.expand([
-        ("negative", -1.5, -2.0),
-        ("integer", 1, 1.0),
-        ("large fraction", 1.6, 1),
-    ])
+    @parameterized.expand(
+        [
+            ("negative", -1.5, -2.0),
+            ("integer", 1, 1.0),
+            ("large fraction", 1.6, 1),
+        ]
+    )
     def test_floor(self, name, input, expected):
         assert_equal(math.floor(input), expected)
 ```
@@ -601,6 +606,8 @@ Here is the same example, this time using `pytest`'s `parametrize` marker:
 ```python
 # test_this2.py
 import pytest
+
+
 @pytest.mark.parametrize(
     "name, input, expected",
     [
@@ -669,6 +676,8 @@ To start using those all you need is to make sure that the test resides in a sub
 
 ```python
 from transformers.testing_utils import TestCasePlus
+
+
 class PathExampleTest(TestCasePlus):
     def test_something_involving_local_locations(self):
         data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro"
@@ -679,6 +688,8 @@ If you don't need to manipulate paths via `pathlib` or you just need a path as a
 
 ```python
 from transformers.testing_utils import TestCasePlus
+
+
 class PathExampleTest(TestCasePlus):
     def test_something_involving_stringified_locations(self):
         examples_dir = self.examples_dir_str
@@ -700,6 +711,8 @@ Here is an example of its usage:
 
 ```python
 from transformers.testing_utils import TestCasePlus
+
+
 class ExamplesTests(TestCasePlus):
     def test_whatever(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
@@ -759,6 +772,7 @@ If you need to temporary override `sys.path` to import from another test for exa
 ```python
 import os
 from transformers.testing_utils import ExtendSysPath
+
 bindir = os.path.abspath(os.path.dirname(__file__))
 with ExtendSysPath(f"{bindir}/.."):
     from test_trainer import TrainerIntegrationCommon  # noqa
@@ -786,20 +800,20 @@ code that's buggy causes some bad state that will affect other tests, do not use
 
 - Here is how to skip whole test unconditionally:
 
-```python
+```python no-style
 @unittest.skip("this bug needs to be fixed")
 def test_feature_x():
 ```
 
 or via pytest:
 
-```python
+```python no-style
 @pytest.mark.skip(reason="this bug needs to be fixed")
 ```
 
 or the `xfail` way:
 
-```python
+```python no-style
 @pytest.mark.xfail
 def test_feature_x():
 ```
@@ -816,6 +830,7 @@ or the whole module:
 
 ```python
 import pytest
+
 if not pytest.config.getoption("--custom-flag"):
     pytest.skip("--custom-flag is missing, skipping tests", allow_module_level=True)
 ```
@@ -835,21 +850,21 @@ docutils = pytest.importorskip("docutils", minversion="0.3")
 
 -  Skip a test based on a condition:
 
-```python
+```python no-style
 @pytest.mark.skipif(sys.version_info < (3,6), reason="requires python3.6 or higher")
 def test_feature_x():
 ```
 
 or:
 
-```python
+```python no-style
 @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
 def test_feature_x():
 ```
 
 or skip the whole module:
 
-```python
+```python no-style
 @pytest.mark.skipif(sys.platform == 'win32', reason="does not run on windows")
 class TestClass():
     def test_feature_x(self):
@@ -863,7 +878,7 @@ The library of tests is ever-growing, and some of the tests take minutes to run,
 an hour for the test suite to complete on CI. Therefore, with some exceptions for essential tests, slow tests should be
 marked as in the example below:
 
-```python
+```python no-style
 from transformers.testing_utils import slow
 @slow
 def test_integration_foo():
@@ -878,8 +893,8 @@ RUN_SLOW=1 pytest tests
 Some decorators like `@parameterized` rewrite test names, therefore `@slow` and the rest of the skip decorators
 `@require_*` have to be listed last for them to work correctly. Here is an example of the correct usage:
 
-```python
-@parameterized.expand(...)
+```python no-style
+@parameterized.expand(...)
 @slow
 def test_integration_foo():
 ```
@@ -935,13 +950,21 @@ In order to test functions that write to `stdout` and/or `stderr`, the test can
 
 ```python
 import sys
-def print_to_stdout(s): print(s)
-def print_to_stderr(s): sys.stderr.write(s)
+
+
+def print_to_stdout(s):
+    print(s)
+
+
+def print_to_stderr(s):
+    sys.stderr.write(s)
+
+
 def test_result_and_stdout(capsys):
     msg = "Hello"
     print_to_stdout(msg)
     print_to_stderr(msg)
-    out, err = capsys.readouterr() # consume the captured output streams
+    out, err = capsys.readouterr()  # consume the captured output streams
     # optional: if you want to replay the consumed streams:
     sys.stdout.write(out)
     sys.stderr.write(err)
@@ -954,10 +977,13 @@ And, of course, most of the time, `stderr` will come as a part of an exception,
 a case:
 
 ```python
-def raise_exception(msg): raise ValueError(msg)
+def raise_exception(msg):
+    raise ValueError(msg)
+
+
 def test_something_exception():
     msg = "Not a good value"
-    error = ''
+    error = ""
     try:
         raise_exception(msg)
     except Exception as e:
@@ -970,7 +996,12 @@ Another approach to capturing stdout is via `contextlib.redirect_stdout`:
 ```python
 from io import StringIO
 from contextlib import redirect_stdout
-def print_to_stdout(s): print(s)
+
+
+def print_to_stdout(s):
+    print(s)
+
+
 def test_result_and_stdout():
     msg = "Hello"
     buffer = StringIO()
@@ -993,6 +1024,7 @@ some `\r`'s in it or not, so it's a simple:
 
 ```python
 from transformers.testing_utils import CaptureStdout
+
 with CaptureStdout() as cs:
     function_that_writes_to_stdout()
 print(cs.out)
@@ -1002,17 +1034,19 @@ Here is a full test example:
 
 ```python
 from transformers.testing_utils import CaptureStdout
+
 msg = "Secret message\r"
 final = "Hello World"
 with CaptureStdout() as cs:
     print(msg + final)
-assert cs.out == final+"\n", f"captured: {cs.out}, expecting {final}"
+assert cs.out == final + "\n", f"captured: {cs.out}, expecting {final}"
 ```
 
 If you'd like to capture `stderr` use the `CaptureStderr` class instead:
 
 ```python
 from transformers.testing_utils import CaptureStderr
+
 with CaptureStderr() as cs:
     function_that_writes_to_stderr()
 print(cs.err)
@@ -1022,6 +1056,7 @@ If you need to capture both streams at once, use the parent `CaptureStd` class:
 
 ```python
 from transformers.testing_utils import CaptureStd
+
 with CaptureStd() as cs:
     function_that_writes_to_stdout_and_stderr()
 print(cs.err, cs.out)
@@ -1044,7 +1079,7 @@ logging.set_verbosity_info()
 logger = logging.get_logger("transformers.models.bart.tokenization_bart")
 with CaptureLogger(logger) as cl:
     logger.info(msg)
-assert cl.out, msg+"\n"
+assert cl.out, msg + "\n"
 ```
 
 ### Testing with environment variables
@@ -1054,6 +1089,8 @@ If you want to test the impact of environment variables for a specific test you
 
 ```python
 from transformers.testing_utils import mockenv
+
+
 class HfArgumentParserTest(unittest.TestCase):
     @mockenv(TRANSFORMERS_VERBOSITY="error")
     def test_env_override(self):
@@ -1065,6 +1102,8 @@ multiple local paths. A helper class `transformers.test_utils.TestCasePlus` come
 
 ```python
 from transformers.testing_utils import TestCasePlus
+
+
 class EnvExampleTest(TestCasePlus):
     def test_external_prog(self):
         env = self.get_env()
@@ -1089,16 +1128,20 @@ seed = 42
 
 # python RNG
 import random
+
 random.seed(seed)
 
 # pytorch RNGs
 import torch
+
 torch.manual_seed(seed)
 torch.backends.cudnn.deterministic = True
-if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
+if torch.cuda.is_available():
+    torch.cuda.manual_seed_all(seed)
 
 # numpy RNG
 import numpy as np
+
 np.random.seed(seed)
 
 # tf RNG
diff --git a/docs/source/tokenizer_summary.mdx b/docs/source/tokenizer_summary.mdx
index db0f9d95dc..401c620d00 100644
--- a/docs/source/tokenizer_summary.mdx
+++ b/docs/source/tokenizer_summary.mdx
@@ -104,6 +104,7 @@ seen before, by decomposing them into known subwords. For instance, the [`~trans
 
 ```py
 >>> from transformers import BertTokenizer
+
 >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
 >>> tokenizer.tokenize("I have a new GPU!")
 ["i", "have", "a", "new", "gp", "##u", "!"]
@@ -117,6 +118,7 @@ As another example, [`~transformers.XLNetTokenizer`] tokenizes our previously ex
 
 ```py
 >>> from transformers import XLNetTokenizer
+
 >>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
 >>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.")
 ["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."]
diff --git a/docs/source/training.mdx b/docs/source/training.mdx
index 805323df82..753d6313c4 100644
--- a/docs/source/training.mdx
+++ b/docs/source/training.mdx
@@ -74,6 +74,7 @@ However, we can instead apply these preprocessing steps to all the splits of our
 def tokenize_function(examples):
     return tokenizer(examples["text"], padding="max_length", truncation=True)
 
+
 tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
 ```
 
@@ -82,8 +83,8 @@ You can learn more about the map method or the other ways to preprocess the data
 Next we will generate a small subset of the training and validation set, to enable faster training:
 
 ```python
-small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) 
-small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) 
+small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
+small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
 full_train_dataset = tokenized_datasets["train"]
 full_eval_dataset = tokenized_datasets["test"]
 ```
@@ -130,9 +131,7 @@ Then we can instantiate a [`Trainer`] like this:
 ```python
 from transformers import Trainer
 
-trainer = Trainer(
-    model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset
-)
+trainer = Trainer(model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_eval_dataset)
 ```
 
 To fine-tune our model, we just need to call
@@ -160,6 +159,7 @@ from datasets import load_metric
 
 metric = load_metric("accuracy")
 
+
 def compute_metrics(eval_pred):
     logits, labels = eval_pred
     predictions = np.argmax(logits, axis=-1)
@@ -322,12 +322,7 @@ from transformers import get_scheduler
 
 num_epochs = 3
 num_training_steps = num_epochs * len(train_dataloader)
-lr_scheduler = get_scheduler(
-    "linear",
-    optimizer=optimizer,
-    num_warmup_steps=0,
-    num_training_steps=num_training_steps
-)
+lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)
 ```
 
 One last thing, we will want to use the GPU if we have access to one (otherwise training might take several hours
@@ -372,7 +367,7 @@ use a metric from the datasets library. Here we accumulate the predictions at ea
 result when the loop is finished.
 
 ```python
-metric= load_metric("accuracy")
+metric = load_metric("accuracy")
 model.eval()
 for batch in eval_dataloader:
     batch = {k: v.to(device) for k, v in batch.items()}
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 407a46f78e..e27812c11f 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -488,15 +488,20 @@ class PretrainedConfig(PushToHubMixin):
         ```python
         # We can't instantiate directly the base class *PretrainedConfig* so let's show the examples on a
         # derived class: BertConfig
-        config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from huggingface.co and cache.
-        config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using *save_pretrained('./test/saved_model/')*
-        config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
-        config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
+        config = BertConfig.from_pretrained(
+            "bert-base-uncased"
+        )  # Download configuration from huggingface.co and cache.
+        config = BertConfig.from_pretrained(
+            "./test/saved_model/"
+        )  # E.g. config (or model) was saved using *save_pretrained('./test/saved_model/')*
+        config = BertConfig.from_pretrained("./test/saved_model/my_configuration.json")
+        config = BertConfig.from_pretrained("bert-base-uncased", output_attentions=True, foo=False)
         assert config.output_attentions == True
-        config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True,
-                                                   foo=False, return_unused_kwargs=True)
+        config, unused_kwargs = BertConfig.from_pretrained(
+            "bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
+        )
         assert config.output_attentions == True
-        assert unused_kwargs == {'foo': False}
+        assert unused_kwargs == {"foo": False}
         ```"""
         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
         if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py
index 5a2c288275..f5e710f5ed 100644
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -588,6 +588,7 @@ class SquadProcessor(DataProcessor):
 
         ```python
         >>> import tensorflow_datasets as tfds
+
         >>> dataset = tfds.load("squad")
 
         >>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py
index 6685085549..39780226d2 100644
--- a/src/transformers/debug_utils.py
+++ b/src/transformers/debug_utils.py
@@ -107,7 +107,7 @@ class DebugUnderflowOverflow:
     given batch, and only do that for batches 1 and 3. Then you instantiate this class as :
 
     ```python
-    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
+    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3])
     ```
 
     And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed.
@@ -121,7 +121,7 @@ class DebugUnderflowOverflow:
     You can also specify the batch number after which to stop the training, with :
 
     ```python
-    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)
+    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3)
     ```
 
     This feature is mainly useful in the tracing mode, but you can use it for any mode.
diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py
index f8f3cfcee3..eaa559ae13 100644
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@@ -273,15 +273,22 @@ class FeatureExtractionMixin:
         ```python
         # We can't instantiate directly the base class *FeatureExtractionMixin* nor *SequenceFeatureExtractor* so let's show the examples on a
         # derived class: *Wav2Vec2FeatureExtractor*
-        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h')    # Download feature_extraction_config from huggingface.co and cache.
-        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/')  # E.g. feature_extractor (or model) was saved using *save_pretrained('./test/saved_model/')*
-        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/preprocessor_config.json')
-        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False, foo=False)
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
+            "facebook/wav2vec2-base-960h"
+        )  # Download feature_extraction_config from huggingface.co and cache.
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
+            "./test/saved_model/"
+        )  # E.g. feature_extractor (or model) was saved using *save_pretrained('./test/saved_model/')*
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("./test/saved_model/preprocessor_config.json")
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
+            "facebook/wav2vec2-base-960h", return_attention_mask=False, foo=False
+        )
         assert feature_extractor.return_attention_mask is False
-        feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False,
-                                                           foo=False, return_unused_kwargs=True)
+        feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained(
+            "facebook/wav2vec2-base-960h", return_attention_mask=False, foo=False, return_unused_kwargs=True
+        )
         assert feature_extractor.return_attention_mask is False
-        assert unused_kwargs == {'foo': False}
+        assert unused_kwargs == {"foo": False}
         ```"""
         feature_extractor_dict, kwargs = cls.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)
 
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index c7e1407321..8a809a92e1 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -956,11 +956,11 @@ PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
     >>> from transformers import {processor_class}, {model_class}
     >>> import torch
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-    >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1
+    >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0)  # Batch size 1
 
     >>> outputs = model(**inputs, labels=labels)
     >>> loss = outputs.loss
@@ -975,11 +975,11 @@ PT_QUESTION_ANSWERING_SAMPLE = r"""
     >>> from transformers import {processor_class}, {model_class}
     >>> import torch
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-    >>> inputs = tokenizer(question, text, return_tensors='pt')
+    >>> inputs = tokenizer(question, text, return_tensors="pt")
     >>> start_positions = torch.tensor([1])
     >>> end_positions = torch.tensor([3])
 
@@ -997,11 +997,11 @@ PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
     >>> from transformers import {processor_class}, {model_class}
     >>> import torch
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-    >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
+    >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
     >>> outputs = model(**inputs, labels=labels)
     >>> loss = outputs.loss
     >>> logits = outputs.logits
@@ -1013,11 +1013,11 @@ PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
     >>> from transformers import {processor_class}, {model_class}
     >>> import torch
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}', problem_type="multi_label_classification")
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}", problem_type="multi_label_classification")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-    >>> labels = torch.tensor([[1, 1]], dtype=torch.float) # need dtype=float for BCEWithLogitsLoss
+    >>> labels = torch.tensor([[1, 1]], dtype=torch.float)  # need dtype=float for BCEWithLogitsLoss
     >>> outputs = model(**inputs, labels=labels)
     >>> loss = outputs.loss
     >>> logits = outputs.logits
@@ -1032,8 +1032,8 @@ PT_MASKED_LM_SAMPLE = r"""
     >>> from transformers import {processor_class}, {model_class}
     >>> import torch
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
     >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"]
@@ -1051,8 +1051,8 @@ PT_BASE_MODEL_SAMPLE = r"""
     >>> from transformers import {processor_class}, {model_class}
     >>> import torch
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
     >>> outputs = model(**inputs)
@@ -1068,16 +1068,16 @@ PT_MULTIPLE_CHOICE_SAMPLE = r"""
     >>> from transformers import {processor_class}, {model_class}
     >>> import torch
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
     >>> choice0 = "It is eaten with a fork and a knife."
     >>> choice1 = "It is eaten while held in the hand."
-    >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
+    >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
 
-    >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='pt', padding=True)
-    >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1
+    >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)
+    >>> outputs = model(**{{k: v.unsqueeze(0) for k, v in encoding.items()}}, labels=labels)  # batch size is 1
 
     >>> # the linear classifier still needs to be trained
     >>> loss = outputs.loss
@@ -1092,8 +1092,8 @@ PT_CAUSAL_LM_SAMPLE = r"""
     >>> import torch
     >>> from transformers import {processor_class}, {model_class}
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
     >>> outputs = model(**inputs, labels=inputs["input_ids"])
@@ -1112,8 +1112,8 @@ PT_SPEECH_BASE_MODEL_SAMPLE = r"""
     >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
     >>> sampling_rate = dataset.features["audio"].sampling_rate
 
-    >>> processor = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> # audio file is decoded on the fly
     >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
@@ -1134,8 +1134,8 @@ PT_SPEECH_CTC_SAMPLE = r"""
     >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
     >>> sampling_rate = dataset.features["audio"].sampling_rate
 
-    >>> processor = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> processor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> # audio file is decoded on the fly
     >>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
@@ -1164,8 +1164,8 @@ PT_SPEECH_SEQ_CLASS_SAMPLE = r"""
     >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
     >>> sampling_rate = dataset.features["audio"].sampling_rate
 
-    >>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> # audio file is decoded on the fly
     >>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt")
@@ -1192,8 +1192,8 @@ PT_SPEECH_FRAME_CLASS_SAMPLE = r"""
     >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
     >>> sampling_rate = dataset.features["audio"].sampling_rate
 
-    >>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> # audio file is decoded on the fly
     >>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt")
@@ -1216,8 +1216,8 @@ PT_SPEECH_XVECTOR_SAMPLE = r"""
     >>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
     >>> sampling_rate = dataset.features["audio"].sampling_rate
 
-    >>> feature_extractor = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> # audio file is decoded on the fly
     >>> inputs = feature_extractor(dataset[:2]["audio"]["array"], return_tensors="pt")
@@ -1227,7 +1227,7 @@ PT_SPEECH_XVECTOR_SAMPLE = r"""
     >>> # the resulting embeddings can be used for cosine similarity-based retrieval
     >>> cosine_sim = torch.nn.CosineSimilarity(dim=-1)
     >>> similarity = cosine_sim(embeddings[0], embeddings[1])
-    >>> threshold = 0.7 # the optimal threshold is dataset-dependent
+    >>> threshold = 0.7  # the optimal threshold is dataset-dependent
     >>> if similarity < threshold:
     ...     print("Speakers are not the same!")
     ```
@@ -1256,12 +1256,14 @@ TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
     >>> from transformers import {processor_class}, {model_class}
     >>> import tensorflow as tf
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
     >>> input_ids = inputs["input_ids"]
-    >>> inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
+    >>> inputs["labels"] = tf.reshape(
+    ...     tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))
+    ... )  # Batch size 1
 
     >>> outputs = model(inputs)
     >>> loss = outputs.loss
@@ -1276,17 +1278,17 @@ TF_QUESTION_ANSWERING_SAMPLE = r"""
     >>> from transformers import {processor_class}, {model_class}
     >>> import tensorflow as tf
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-    >>> input_dict = tokenizer(question, text, return_tensors='tf')
+    >>> input_dict = tokenizer(question, text, return_tensors="tf")
     >>> outputs = model(input_dict)
     >>> start_logits = outputs.start_logits
     >>> end_logits = outputs.end_logits
 
     >>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
-    >>> answer = ' '.join(all_tokens[tf.math.argmax(start_logits, 1)[0] : tf.math.argmax(end_logits, 1)[0]+1])
+    >>> answer = " ".join(all_tokens[tf.math.argmax(start_logits, 1)[0] : tf.math.argmax(end_logits, 1)[0] + 1])
     ```
 """
 
@@ -1297,11 +1299,11 @@ TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
     >>> from transformers import {processor_class}, {model_class}
     >>> import tensorflow as tf
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
-    >>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
+    >>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1))  # Batch size 1
 
     >>> outputs = model(inputs)
     >>> loss = outputs.loss
@@ -1316,8 +1318,8 @@ TF_MASKED_LM_SAMPLE = r"""
     >>> from transformers import {processor_class}, {model_class}
     >>> import tensorflow as tf
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="tf")
     >>> inputs["labels"] = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"]
@@ -1335,8 +1337,8 @@ TF_BASE_MODEL_SAMPLE = r"""
     >>> from transformers import {processor_class}, {model_class}
     >>> import tensorflow as tf
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
     >>> outputs = model(inputs)
@@ -1352,16 +1354,16 @@ TF_MULTIPLE_CHOICE_SAMPLE = r"""
     >>> from transformers import {processor_class}, {model_class}
     >>> import tensorflow as tf
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
     >>> choice0 = "It is eaten with a fork and a knife."
     >>> choice1 = "It is eaten while held in the hand."
 
-    >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='tf', padding=True)
+    >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="tf", padding=True)
     >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
-    >>> outputs = model(inputs) # batch size is 1
+    >>> outputs = model(inputs)  # batch size is 1
 
     >>> # the linear classifier still needs to be trained
     >>> logits = outputs.logits
@@ -1375,8 +1377,8 @@ TF_CAUSAL_LM_SAMPLE = r"""
     >>> from transformers import {processor_class}, {model_class}
     >>> import tensorflow as tf
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
     >>> outputs = model(inputs)
@@ -1401,10 +1403,10 @@ FLAX_TOKEN_CLASSIFICATION_SAMPLE = r"""
     ```python
     >>> from transformers import {processor_class}, {model_class}
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax')
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
 
     >>> outputs = model(**inputs)
     >>> logits = outputs.logits
@@ -1417,11 +1419,11 @@ FLAX_QUESTION_ANSWERING_SAMPLE = r"""
     ```python
     >>> from transformers import {processor_class}, {model_class}
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
-    >>> inputs = tokenizer(question, text, return_tensors='jax')
+    >>> inputs = tokenizer(question, text, return_tensors="jax")
 
     >>> outputs = model(**inputs)
     >>> start_scores = outputs.start_logits
@@ -1435,10 +1437,10 @@ FLAX_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
     ```python
     >>> from transformers import {processor_class}, {model_class}
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax')
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
 
     >>> outputs = model(**inputs)
     >>> logits = outputs.logits
@@ -1451,10 +1453,10 @@ FLAX_MASKED_LM_SAMPLE = r"""
     ```python
     >>> from transformers import {processor_class}, {model_class}
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
-    >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors='jax')
+    >>> inputs = tokenizer("The capital of France is {mask}.", return_tensors="jax")
 
     >>> outputs = model(**inputs)
     >>> logits = outputs.logits
@@ -1467,10 +1469,10 @@ FLAX_BASE_MODEL_SAMPLE = r"""
     ```python
     >>> from transformers import {processor_class}, {model_class}
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
-    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors='jax')
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="jax")
     >>> outputs = model(**inputs)
 
     >>> last_hidden_states = outputs.last_hidden_state
@@ -1483,15 +1485,15 @@ FLAX_MULTIPLE_CHOICE_SAMPLE = r"""
     ```python
     >>> from transformers import {processor_class}, {model_class}
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
     >>> choice0 = "It is eaten with a fork and a knife."
     >>> choice1 = "It is eaten while held in the hand."
 
-    >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors='jax', padding=True)
-    >>> outputs = model(**{{k: v[None, :] for k,v in encoding.items()}})
+    >>> encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="jax", padding=True)
+    >>> outputs = model(**{{k: v[None, :] for k, v in encoding.items()}})
 
     >>> logits = outputs.logits
     ```
@@ -1503,8 +1505,8 @@ FLAX_CAUSAL_LM_SAMPLE = r"""
     ```python
     >>> from transformers import {processor_class}, {model_class}
 
-    >>> tokenizer = {processor_class}.from_pretrained('{checkpoint}')
-    >>> model = {model_class}.from_pretrained('{checkpoint}')
+    >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
+    >>> model = {model_class}.from_pretrained("{checkpoint}")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
     >>> outputs = model(**inputs)
diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py
index 656c289c34..ed36d7a5fc 100644
--- a/src/transformers/generation_tf_utils.py
+++ b/src/transformers/generation_tf_utils.py
@@ -488,40 +488,60 @@ class TFGenerationMixin:
         Examples:
 
         ```python
-        tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
-        model = TFAutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from huggingface.co and cache.
+        tokenizer = AutoTokenizer.from_pretrained("distilgpt2")  # Initialize tokenizer
+        model = TFAutoModelWithLMHead.from_pretrained(
+            "distilgpt2"
+        )  # Download model and configuration from huggingface.co and cache.
         outputs = model.generate(max_length=40)  # do greedy decoding
-        print(f'Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')
+        print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")
 
-        tokenizer = AutoTokenizer.from_pretrained('openai-gpt')   # Initialize tokenizer
-        model = TFAutoModelWithLMHead.from_pretrained('openai-gpt')    # Download model and configuration from huggingface.co and cache.
-        input_context = 'The dog'
-        input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
-        outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5)  # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
-        for i in range(3): #  3 output sequences were generated
-            print(f'Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}')
+        tokenizer = AutoTokenizer.from_pretrained("openai-gpt")  # Initialize tokenizer
+        model = TFAutoModelWithLMHead.from_pretrained(
+            "openai-gpt"
+        )  # Download model and configuration from huggingface.co and cache.
+        input_context = "The dog"
+        input_ids = tokenizer.encode(input_context, return_tensors="tf")  # encode input context
+        outputs = model.generate(
+            input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5
+        )  # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
+        for i in range(3):  #  3 output sequences were generated
+            print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}")
 
-        tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
-        model = TFAutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from huggingface.co and cache.
-        input_context = 'The dog'
-        input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
-        outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True)  # generate 3 candidates using sampling
-        for i in range(3): #  3 output sequences were generated
-            print(f'Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}')
+        tokenizer = AutoTokenizer.from_pretrained("distilgpt2")  # Initialize tokenizer
+        model = TFAutoModelWithLMHead.from_pretrained(
+            "distilgpt2"
+        )  # Download model and configuration from huggingface.co and cache.
+        input_context = "The dog"
+        input_ids = tokenizer.encode(input_context, return_tensors="tf")  # encode input context
+        outputs = model.generate(
+            input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True
+        )  # generate 3 candidates using sampling
+        for i in range(3):  #  3 output sequences were generated
+            print(f"Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}")
 
-        tokenizer = AutoTokenizer.from_pretrained('ctrl')   # Initialize tokenizer
-        model = TFAutoModelWithLMHead.from_pretrained('ctrl')    # Download model and configuration from huggingface.co and cache.
-        input_context = 'Legal My neighbor is'  # "Legal" is one of the control codes for ctrl
-        input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
-        outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2)  # generate sequences
-        print(f'Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')
+        tokenizer = AutoTokenizer.from_pretrained("ctrl")  # Initialize tokenizer
+        model = TFAutoModelWithLMHead.from_pretrained(
+            "ctrl"
+        )  # Download model and configuration from huggingface.co and cache.
+        input_context = "Legal My neighbor is"  # "Legal" is one of the control codes for ctrl
+        input_ids = tokenizer.encode(input_context, return_tensors="tf")  # encode input context
+        outputs = model.generate(
+            input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2
+        )  # generate sequences
+        print(f"Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}")
 
-        tokenizer = AutoTokenizer.from_pretrained('gpt2')   # Initialize tokenizer
-        model = TFAutoModelWithLMHead.from_pretrained('gpt2')    # Download model and configuration from huggingface.co and cache.
-        input_context = 'My cute dog'
-        bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
-        input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
-        outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids)  # generate sequences without allowing bad_words to be generated
+        tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Initialize tokenizer
+        model = TFAutoModelWithLMHead.from_pretrained(
+            "gpt2"
+        )  # Download model and configuration from huggingface.co and cache.
+        input_context = "My cute dog"
+        bad_words_ids = [
+            tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ["idiot", "stupid", "shut up"]
+        ]
+        input_ids = tokenizer.encode(input_context, return_tensors="tf")  # encode input context
+        outputs = model.generate(
+            input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids
+        )  # generate sequences without allowing bad_words to be generated
         ```"""
 
         # We cannot generate if the model does not have a LM head
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index e8bc28383e..6990924e42 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -939,8 +939,8 @@ class GenerationMixin:
         >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
         >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
         >>> document = (
-        ... "at least two people were killed in a suspected bomb attack on a passenger bus "
-        ... "in the strife-torn southern philippines on monday , the military said."
+        ...     "at least two people were killed in a suspected bomb attack on a passenger bus "
+        ...     "in the strife-torn southern philippines on monday , the military said."
         ... )
         >>> # encode input context
         >>> input_ids = tokenizer(document, return_tensors="pt").input_ids
@@ -1329,10 +1329,10 @@ class GenerationMixin:
 
         ```python
         >>> from transformers import (
-        ... AutoTokenizer,
-        ... AutoModelForCausalLM,
-        ... LogitsProcessorList,
-        ... MinLengthLogitsProcessor,
+        ...     AutoTokenizer,
+        ...     AutoModelForCausalLM,
+        ...     LogitsProcessorList,
+        ...     MinLengthLogitsProcessor,
         ... )
 
         >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
@@ -1345,9 +1345,11 @@ class GenerationMixin:
         >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
 
         >>> # instantiate logits processors
-        >>> logits_processor = LogitsProcessorList([
-        ...     MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
-        ... ])
+        >>> logits_processor = LogitsProcessorList(
+        ...     [
+        ...         MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
+        ...     ]
+        ... )
 
         >>> outputs = model.greedy_search(input_ids, logits_processor=logits_processor)
 
@@ -1556,12 +1558,12 @@ class GenerationMixin:
 
         ```python
         >>> from transformers import (
-        ...    AutoTokenizer,
-        ...    AutoModelForCausalLM,
-        ...    LogitsProcessorList,
-        ...    MinLengthLogitsProcessor,
-        ...    TopKLogitsWarper,
-        ...    TemperatureLogitsWarper,
+        ...     AutoTokenizer,
+        ...     AutoModelForCausalLM,
+        ...     LogitsProcessorList,
+        ...     MinLengthLogitsProcessor,
+        ...     TopKLogitsWarper,
+        ...     TemperatureLogitsWarper,
         ... )
 
         >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
@@ -1574,14 +1576,18 @@ class GenerationMixin:
         >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
 
         >>> # instantiate logits processors
-        >>> logits_processor = LogitsProcessorList([
-        ...     MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
-        ... ])
+        >>> logits_processor = LogitsProcessorList(
+        ...     [
+        ...         MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
+        ...     ]
+        ... )
         >>> # instantiate logits processors
-        >>> logits_warper = LogitsProcessorList([
-        ...     TopKLogitsWarper(50),
-        ...     TemperatureLogitsWarper(0.7),
-        ... ])
+        >>> logits_warper = LogitsProcessorList(
+        ...     [
+        ...         TopKLogitsWarper(50),
+        ...         TemperatureLogitsWarper(0.7),
+        ...     ]
+        ... )
 
         >>> outputs = model.sample(input_ids, logits_processor=logits_processor, logits_warper=logits_warper)
 
@@ -1795,11 +1801,11 @@ class GenerationMixin:
 
         ```python
         >>> from transformers import (
-        ...    AutoTokenizer,
-        ...    AutoModelForSeq2SeqLM,
-        ...    LogitsProcessorList,
-        ...    MinLengthLogitsProcessor,
-        ...    BeamSearchScorer,
+        ...     AutoTokenizer,
+        ...     AutoModelForSeq2SeqLM,
+        ...     LogitsProcessorList,
+        ...     MinLengthLogitsProcessor,
+        ...     BeamSearchScorer,
         ... )
         >>> import torch
 
@@ -1818,7 +1824,9 @@ class GenerationMixin:
 
         >>> # add encoder_outputs to model keyword arguments
         >>> model_kwargs = {
-        ...     "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
+        ...     "encoder_outputs": model.get_encoder()(
+        ...         encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
+        ...     )
         ... }
 
         >>> # instantiate beam scorer
@@ -1829,9 +1837,11 @@ class GenerationMixin:
         ... )
 
         >>> # instantiate logits processors
-        >>> logits_processor = LogitsProcessorList([
-        ...     MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
-        ... ])
+        >>> logits_processor = LogitsProcessorList(
+        ...     [
+        ...         MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
+        ...     ]
+        ... )
 
         >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
 
@@ -2112,7 +2122,9 @@ class GenerationMixin:
 
         >>> # add encoder_outputs to model keyword arguments
         >>> model_kwargs = {
-        ...     "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
+        ...     "encoder_outputs": model.get_encoder()(
+        ...         encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
+        ...     )
         ... }
 
         >>> # instantiate beam scorer
@@ -2124,14 +2136,16 @@ class GenerationMixin:
         ... )
 
         >>> # instantiate logits processors
-        >>> logits_processor = LogitsProcessorList([
-        ...     MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id)
-        ... ])
+        >>> logits_processor = LogitsProcessorList(
+        ...     [MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id)]
+        ... )
         >>> # instantiate logits processors
-        >>> logits_warper = LogitsProcessorList([
-        ...     TopKLogitsWarper(50),
-        ...     TemperatureLogitsWarper(0.7),
-        ... ])
+        >>> logits_warper = LogitsProcessorList(
+        ...     [
+        ...         TopKLogitsWarper(50),
+        ...         TemperatureLogitsWarper(0.7),
+        ...     ]
+        ... )
 
         >>> outputs = model.beam_sample(
         ...     input_ids, beam_scorer, logits_processor=logits_processor, logits_warper=logits_warper, **model_kwargs
@@ -2384,12 +2398,12 @@ class GenerationMixin:
 
         ```python
         >>> from transformers import (
-        ...    AutoTokenizer,
-        ...    AutoModelForSeq2SeqLM,
-        ...    LogitsProcessorList,
-        ...    MinLengthLogitsProcessor,
-        ...    HammingDiversityLogitsProcessor,
-        ...    BeamSearchScorer,
+        ...     AutoTokenizer,
+        ...     AutoModelForSeq2SeqLM,
+        ...     LogitsProcessorList,
+        ...     MinLengthLogitsProcessor,
+        ...     HammingDiversityLogitsProcessor,
+        ...     BeamSearchScorer,
         ... )
         >>> import torch
 
@@ -2408,7 +2422,9 @@ class GenerationMixin:
 
         >>> # add encoder_outputs to model keyword arguments
         >>> model_kwargs = {
-        ...     "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
+        ...     "encoder_outputs": model.get_encoder()(
+        ...         encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True
+        ...     )
         ... }
 
         >>> # instantiate beam scorer
@@ -2417,16 +2433,20 @@ class GenerationMixin:
         ...     max_length=model.config.max_length,
         ...     num_beams=num_beams,
         ...     device=model.device,
-        ...     num_beam_groups=3
+        ...     num_beam_groups=3,
         ... )
 
         >>> # instantiate logits processors
-        >>> logits_processor = LogitsProcessorList([
-        ...     HammingDiversityLogitsProcessor(5.5, num_beams=6, num_beam_groups=3),
-        ...     MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
-        ... ])
+        >>> logits_processor = LogitsProcessorList(
+        ...     [
+        ...         HammingDiversityLogitsProcessor(5.5, num_beams=6, num_beam_groups=3),
+        ...         MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
+        ...     ]
+        ... )
 
-        >>> outputs = model.group_beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
+        >>> outputs = model.group_beam_search(
+        ...     input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs
+        ... )
 
         >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
         ```"""
diff --git a/src/transformers/keras_callbacks.py b/src/transformers/keras_callbacks.py
index ba9a6add8b..f126134292 100644
--- a/src/transformers/keras_callbacks.py
+++ b/src/transformers/keras_callbacks.py
@@ -32,10 +32,12 @@ class KerasMetricCallback(Callback):
 
     ```py
     from datasets import load_metric
+
     rouge_metric = load_metric("rouge")
 
+
     def rouge_fn(predictions, labels):
-        decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True))
+        decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
         decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
         result = rouge_metric.compute(predictions=decoded_predictions, references=decoded_labels)
         return {key: value.mid.fmeasure * 100 for key, value in result.items()}
diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py
index 178f7f48c3..a5838cf43c 100644
--- a/src/transformers/modelcard.py
+++ b/src/transformers/modelcard.py
@@ -168,10 +168,14 @@ class ModelCard:
         Examples:
 
         ```python
-        modelcard = ModelCard.from_pretrained('bert-base-uncased')    # Download model card from huggingface.co and cache.
-        modelcard = ModelCard.from_pretrained('./test/saved_model/')  # E.g. model card was saved using *save_pretrained('./test/saved_model/')*
-        modelcard = ModelCard.from_pretrained('./test/saved_model/modelcard.json')
-        modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
+        modelcard = ModelCard.from_pretrained(
+            "bert-base-uncased"
+        )  # Download model card from huggingface.co and cache.
+        modelcard = ModelCard.from_pretrained(
+            "./test/saved_model/"
+        )  # E.g. model card was saved using *save_pretrained('./test/saved_model/')*
+        modelcard = ModelCard.from_pretrained("./test/saved_model/modelcard.json")
+        modelcard = ModelCard.from_pretrained("bert-base-uncased", output_attentions=True, foo=False)
         ```"""
         # This imports every model so let's do it dynamically here.
         from transformers.models.auto.configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index 0c1f17437f..c1f1ce77d0 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -200,16 +200,21 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
 
         ```python
         >>> from transformers import FlaxBertModel
+
         >>> # load model
-        >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+        >>> model = FlaxBertModel.from_pretrained("bert-base-cased")
         >>> # By default, the model parameters will be in fp32 precision, to cast these to bfloat16 precision
         >>> model.params = model.to_bf16(model.params)
         >>> # If you want don't want to cast certain parameters (for example layer norm bias and scale)
         >>> # then pass the mask as follows
         >>> from flax import traverse_util
-        >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+
+        >>> model = FlaxBertModel.from_pretrained("bert-base-cased")
         >>> flat_params = traverse_util.flatten_dict(model.params)
-        >>> mask = {path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+        >>> mask = {
+        ...     path: (path[-2:] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale"))
+        ...     for path in flat_params
+        ... }
         >>> mask = traverse_util.unflatten_dict(mask)
         >>> model.params = model.to_bf16(model.params, mask)
         ```"""
@@ -231,8 +236,9 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
 
         ```python
         >>> from transformers import FlaxBertModel
+
         >>> # Download model and configuration from huggingface.co
-        >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+        >>> model = FlaxBertModel.from_pretrained("bert-base-cased")
         >>> # By default, the model params will be in fp32, to illustrate the use of this method,
         >>> # we'll first cast to fp16 and back to fp32
         >>> model.params = model.to_f16(model.params)
@@ -260,16 +266,21 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
 
         ```python
         >>> from transformers import FlaxBertModel
+
         >>> # load model
-        >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+        >>> model = FlaxBertModel.from_pretrained("bert-base-cased")
         >>> # By default, the model params will be in fp32, to cast these to float16
         >>> model.params = model.to_fp16(model.params)
         >>> # If you want don't want to cast certain parameters (for example layer norm bias and scale)
         >>> # then pass the mask as follows
         >>> from flax import traverse_util
-        >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+
+        >>> model = FlaxBertModel.from_pretrained("bert-base-cased")
         >>> flat_params = traverse_util.flatten_dict(model.params)
-        >>> mask = {path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+        >>> mask = {
+        ...     path: (path[-2:] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale"))
+        ...     for path in flat_params
+        ... }
         >>> mask = traverse_util.unflatten_dict(mask)
         >>> model.params = model.to_fp16(model.params, mask)
         ```"""
@@ -377,13 +388,14 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
 
         ```python
         >>> from transformers import BertConfig, FlaxBertModel
+
         >>> # Download model and configuration from huggingface.co and cache.
-        >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+        >>> model = FlaxBertModel.from_pretrained("bert-base-cased")
         >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
-        >>> model = FlaxBertModel.from_pretrained('./test/saved_model/')
+        >>> model = FlaxBertModel.from_pretrained("./test/saved_model/")
         >>> # Loading from a PyTorch checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
-        >>> config = BertConfig.from_json_file('./pt_model/config.json')
-        >>> model = FlaxBertModel.from_pretrained('./pt_model/pytorch_model.bin', from_pt=True, config=config)
+        >>> config = BertConfig.from_json_file("./pt_model/config.json")
+        >>> model = FlaxBertModel.from_pretrained("./pt_model/pytorch_model.bin", from_pt=True, config=config)
         ```"""
         config = kwargs.pop("config", None)
         cache_dir = kwargs.pop("cache_dir", None)
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index 2ff8840303..fdcd2735ba 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -1460,16 +1460,17 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
 
         ```python
         >>> from transformers import BertConfig, TFBertModel
+
         >>> # Download model and configuration from huggingface.co and cache.
-        >>> model = TFBertModel.from_pretrained('bert-base-uncased')
+        >>> model = TFBertModel.from_pretrained("bert-base-uncased")
         >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
-        >>> model = TFBertModel.from_pretrained('./test/saved_model/')
+        >>> model = TFBertModel.from_pretrained("./test/saved_model/")
         >>> # Update configuration during loading.
-        >>> model = TFBertModel.from_pretrained('bert-base-uncased', output_attentions=True)
+        >>> model = TFBertModel.from_pretrained("bert-base-uncased", output_attentions=True)
         >>> assert model.config.output_attentions == True
         >>> # Loading from a Pytorch model file instead of a TensorFlow checkpoint (slower, for example purposes, not runnable).
-        >>> config = BertConfig.from_json_file('./pt_model/my_pt_model_config.json')
-        >>> model = TFBertModel.from_pretrained('./pt_model/my_pytorch_model.bin', from_pt=True, config=config)
+        >>> config = BertConfig.from_json_file("./pt_model/my_pt_model_config.json")
+        >>> model = TFBertModel.from_pretrained("./pt_model/my_pytorch_model.bin", from_pt=True, config=config)
         ```"""
         config = kwargs.pop("config", None)
         cache_dir = kwargs.pop("cache_dir", None)
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 53459eb7bf..324046cc6a 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1211,18 +1211,19 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
 
         ```python
         >>> from transformers import BertConfig, BertModel
+
         >>> # Download model and configuration from huggingface.co and cache.
-        >>> model = BertModel.from_pretrained('bert-base-uncased')
+        >>> model = BertModel.from_pretrained("bert-base-uncased")
         >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable).
-        >>> model = BertModel.from_pretrained('./test/saved_model/')
+        >>> model = BertModel.from_pretrained("./test/saved_model/")
         >>> # Update configuration during loading.
-        >>> model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
+        >>> model = BertModel.from_pretrained("bert-base-uncased", output_attentions=True)
         >>> assert model.config.output_attentions == True
         >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
-        >>> config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
-        >>> model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        >>> config = BertConfig.from_json_file("./tf_model/my_tf_model_config.json")
+        >>> model = BertModel.from_pretrained("./tf_model/my_tf_checkpoint.ckpt.index", from_tf=True, config=config)
         >>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower)
-        >>> model = BertModel.from_pretrained('bert-base-uncased', from_flax=True)
+        >>> model = BertModel.from_pretrained("bert-base-uncased", from_flax=True)
         ```"""
         config = kwargs.pop("config", None)
         state_dict = kwargs.pop("state_dict", None)
@@ -2320,6 +2321,7 @@ def apply_chunking_to_forward(
         hidden_states = self.decoder(hidden_states)
         return hidden_states
 
+
     # implement a chunked forward function
     def forward(self, hidden_states):
         return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states)
diff --git a/src/transformers/models/albert/configuration_albert.py b/src/transformers/models/albert/configuration_albert.py
index fb87e845d2..1f871bf71d 100644
--- a/src/transformers/models/albert/configuration_albert.py
+++ b/src/transformers/models/albert/configuration_albert.py
@@ -90,15 +90,16 @@ class AlbertConfig(PretrainedConfig):
 
     ```python
     >>> from transformers import AlbertConfig, AlbertModel
+
     >>> # Initializing an ALBERT-xxlarge style configuration
     >>> albert_xxlarge_configuration = AlbertConfig()
 
     >>> # Initializing an ALBERT-base style configuration
     >>> albert_base_configuration = AlbertConfig(
-    ...      hidden_size=768,
-    ...      num_attention_heads=12,
-    ...      intermediate_size=3072,
-    ...  )
+    ...     hidden_size=768,
+    ...     num_attention_heads=12,
+    ...     intermediate_size=3072,
+    ... )
 
     >>> # Initializing a model from the ALBERT-base style configuration
     >>> model = AlbertModel(albert_xxlarge_configuration)
diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py
index 235351279a..9c7ccccc8c 100755
--- a/src/transformers/models/albert/modeling_albert.py
+++ b/src/transformers/models/albert/modeling_albert.py
@@ -802,10 +802,12 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
         >>> from transformers import AlbertTokenizer, AlbertForPreTraining
         >>> import torch
 
-        >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        >>> model = AlbertForPreTraining.from_pretrained('albert-base-v2')
+        >>> tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
+        >>> model = AlbertForPreTraining.from_pretrained("albert-base-v2")
 
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
+        ...     0
+        ... )  # Batch size 1
         >>> outputs = model(input_ids)
 
         >>> prediction_logits = outputs.prediction_logits
diff --git a/src/transformers/models/albert/modeling_flax_albert.py b/src/transformers/models/albert/modeling_flax_albert.py
index 57e953a77a..c4627d0327 100644
--- a/src/transformers/models/albert/modeling_flax_albert.py
+++ b/src/transformers/models/albert/modeling_flax_albert.py
@@ -748,8 +748,8 @@ FLAX_ALBERT_FOR_PRETRAINING_DOCSTRING = """
     ```python
     >>> from transformers import AlbertTokenizer, FlaxAlbertForPreTraining
 
-    >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-    >>> model = FlaxAlbertForPreTraining.from_pretrained('albert-base-v2')
+    >>> tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
+    >>> model = FlaxAlbertForPreTraining.from_pretrained("albert-base-v2")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
     >>> outputs = model(**inputs)
diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py
index 86f6698c9f..f4440d7c2c 100644
--- a/src/transformers/models/albert/modeling_tf_albert.py
+++ b/src/transformers/models/albert/modeling_tf_albert.py
@@ -892,10 +892,12 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss):
         >>> import tensorflow as tf
         >>> from transformers import AlbertTokenizer, TFAlbertForPreTraining
 
-        >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        >>> model = TFAlbertForPreTraining.from_pretrained('albert-base-v2')
+        >>> tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
+        >>> model = TFAlbertForPreTraining.from_pretrained("albert-base-v2")
 
-        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
+        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[
+        ...     None, :
+        ... ]  # Batch size 1
         >>> outputs = model(input_ids)
 
         >>> prediction_logits = outputs.prediction_logits
diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py
index c5914869bf..cc0497fb78 100644
--- a/src/transformers/models/auto/auto_factory.py
+++ b/src/transformers/models/auto/auto_factory.py
@@ -51,8 +51,9 @@ FROM_CONFIG_DOCSTRING = """
 
         ```python
         >>> from transformers import AutoConfig, BaseAutoModelClass
+
         >>> # Download configuration from huggingface.co and cache.
-        >>> config = AutoConfig.from_pretrained('checkpoint_placeholder')
+        >>> config = AutoConfig.from_pretrained("checkpoint_placeholder")
         >>> model = BaseAutoModelClass.from_config(config)
         ```
 """
@@ -147,16 +148,18 @@ FROM_PRETRAINED_TORCH_DOCSTRING = """
         >>> from transformers import AutoConfig, BaseAutoModelClass
 
         >>> # Download model and configuration from huggingface.co and cache.
-        >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
+        >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder")
 
         >>> # Update configuration during loading
-        >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
+        >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True)
         >>> model.config.output_attentions
         True
 
         >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-        >>> config = AutoConfig.from_pretrained('./tf_model/shortcut_placeholder_tf_model_config.json')
-        >>> model = BaseAutoModelClass.from_pretrained('./tf_model/shortcut_placeholder_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        >>> config = AutoConfig.from_pretrained("./tf_model/shortcut_placeholder_tf_model_config.json")
+        >>> model = BaseAutoModelClass.from_pretrained(
+        ...     "./tf_model/shortcut_placeholder_tf_checkpoint.ckpt.index", from_tf=True, config=config
+        ... )
         ```
 """
 
@@ -241,16 +244,18 @@ FROM_PRETRAINED_TF_DOCSTRING = """
         >>> from transformers import AutoConfig, BaseAutoModelClass
 
         >>> # Download model and configuration from huggingface.co and cache.
-        >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
+        >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder")
 
         >>> # Update configuration during loading
-        >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
+        >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True)
         >>> model.config.output_attentions
         True
 
         >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
-        >>> config = AutoConfig.from_pretrained('./pt_model/shortcut_placeholder_pt_model_config.json')
-        >>> model = BaseAutoModelClass.from_pretrained('./pt_model/shortcut_placeholder_pytorch_model.bin', from_pt=True, config=config)
+        >>> config = AutoConfig.from_pretrained("./pt_model/shortcut_placeholder_pt_model_config.json")
+        >>> model = BaseAutoModelClass.from_pretrained(
+        ...     "./pt_model/shortcut_placeholder_pytorch_model.bin", from_pt=True, config=config
+        ... )
         ```
 """
 
@@ -335,16 +340,18 @@ FROM_PRETRAINED_FLAX_DOCSTRING = """
         >>> from transformers import AutoConfig, BaseAutoModelClass
 
         >>> # Download model and configuration from huggingface.co and cache.
-        >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
+        >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder")
 
         >>> # Update configuration during loading
-        >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
+        >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True)
         >>> model.config.output_attentions
         True
 
         >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
-        >>> config = AutoConfig.from_pretrained('./pt_model/shortcut_placeholder_pt_model_config.json')
-        >>> model = BaseAutoModelClass.from_pretrained('./pt_model/shortcut_placeholder_pytorch_model.bin', from_pt=True, config=config)
+        >>> config = AutoConfig.from_pretrained("./pt_model/shortcut_placeholder_pt_model_config.json")
+        >>> model = BaseAutoModelClass.from_pretrained(
+        ...     "./pt_model/shortcut_placeholder_pytorch_model.bin", from_pt=True, config=config
+        ... )
         ```
 """
 
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 219bd6fb93..86069397ea 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -555,24 +555,28 @@ class AutoConfig:
         >>> from transformers import AutoConfig
 
         >>> # Download configuration from huggingface.co and cache.
-        >>> config = AutoConfig.from_pretrained('bert-base-uncased')
+        >>> config = AutoConfig.from_pretrained("bert-base-uncased")
 
         >>> # Download configuration from huggingface.co (user-uploaded) and cache.
-        >>> config = AutoConfig.from_pretrained('dbmdz/bert-base-german-cased')
+        >>> config = AutoConfig.from_pretrained("dbmdz/bert-base-german-cased")
 
         >>> # If configuration file is in a directory (e.g., was saved using *save_pretrained('./test/saved_model/')*).
-        >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/')
+        >>> config = AutoConfig.from_pretrained("./test/bert_saved_model/")
 
         >>> # Load a specific configuration file.
-        >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
+        >>> config = AutoConfig.from_pretrained("./test/bert_saved_model/my_configuration.json")
 
         >>> # Change some config attributes when loading a pretrained config.
-        >>> config = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
+        >>> config = AutoConfig.from_pretrained("bert-base-uncased", output_attentions=True, foo=False)
         >>> config.output_attentions
         True
-        >>> config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False, return_unused_kwargs=True)
+
+        >>> config, unused_kwargs = AutoConfig.from_pretrained(
+        ...     "bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True
+        ... )
         >>> config.output_attentions
         True
+
         >>> config.unused_kwargs
         {'foo': False}
         ```"""
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index 8d1f8cf380..7aa03abdff 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -141,10 +141,10 @@ class AutoFeatureExtractor:
         >>> from transformers import AutoFeatureExtractor
 
         >>> # Download feature extractor from huggingface.co and cache.
-        >>> feature_extractor = AutoFeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h')
+        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
 
         >>> # If feature extractor files are in a directory (e.g. feature extractor was saved using *save_pretrained('./test/saved_model/')*)
-        >>> feature_extractor = AutoFeatureExtractor.from_pretrained('./test/saved_model/')
+        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("./test/saved_model/")
         ```"""
         config = kwargs.pop("config", None)
         kwargs["_from_auto"] = True
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 0a49dc24f5..d2c6d496f2 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -134,10 +134,10 @@ class AutoProcessor:
         >>> from transformers import AutoProcessor
 
         >>> # Download processor from huggingface.co and cache.
-        >>> processor = AutoProcessor.from_pretrained('facebook/wav2vec2-base-960h')
+        >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
 
         >>> # If processor files are in a directory (e.g. processor was saved using *save_pretrained('./test/saved_model/')*)
-        >>> processor = AutoProcessor.from_pretrained('./test/saved_model/')
+        >>> processor = AutoProcessor.from_pretrained("./test/saved_model/")
         ```"""
         config = kwargs.pop("config", None)
         kwargs["_from_auto"] = True
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index d73d56b27f..896ee930e2 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -451,13 +451,13 @@ class AutoTokenizer:
         >>> from transformers import AutoTokenizer
 
         >>> # Download vocabulary from huggingface.co and cache.
-        >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
 
         >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
-        >>> tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
+        >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
 
         >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
-        >>> tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
+        >>> tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/")
         ```"""
         config = kwargs.pop("config", None)
         kwargs["_from_auto"] = True
diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py
index 5862b9595b..4191f1d8e3 100755
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@@ -1779,8 +1779,8 @@ class BartForCausalLM(BartPretrainedModel):
         ```python
         >>> from transformers import BartTokenizer, BartForCausalLM
 
-        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
-        >>> model = BartForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
+        >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
+        >>> model = BartForCausalLM.from_pretrained("facebook/bart-large", add_cross_attention=False)
         >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/bart/modeling_flax_bart.py b/src/transformers/models/bart/modeling_flax_bart.py
index d6cdcb7a01..f850227b3a 100644
--- a/src/transformers/models/bart/modeling_flax_bart.py
+++ b/src/transformers/models/bart/modeling_flax_bart.py
@@ -1021,11 +1021,11 @@ class FlaxBartPreTrainedModel(FlaxPreTrainedModel):
         ```python
         >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
 
-        >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
-        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+        >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
+        >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax")
         >>> encoder_outputs = model.encode(**inputs)
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1087,11 +1087,11 @@ class FlaxBartPreTrainedModel(FlaxPreTrainedModel):
         ```python
         >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
 
-        >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
-        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+        >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
+        >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax")
         >>> encoder_outputs = model.encode(**inputs)
 
         >>> decoder_start_token_id = model.config.decoder_start_token_id
@@ -1355,11 +1355,11 @@ class FlaxBartForConditionalGeneration(FlaxBartPreTrainedModel):
         ```python
         >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
 
-        >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
-        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+        >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
+        >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax")
         >>> encoder_outputs = model.encode(**inputs)
 
         >>> decoder_start_token_id = model.config.decoder_start_token_id
diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py
index 19a2754148..0aeabccf4f 100755
--- a/src/transformers/models/beit/modeling_beit.py
+++ b/src/transformers/models/beit/modeling_beit.py
@@ -633,11 +633,11 @@ class BeitModel(BeitPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
-        >>> model = BeitModel.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
+        >>> feature_extractor = BeitFeatureExtractor.from_pretrained("microsoft/beit-base-patch16-224-pt22k-ft22k")
+        >>> model = BeitModel.from_pretrained("microsoft/beit-base-patch16-224-pt22k-ft22k")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -750,11 +750,11 @@ class BeitForMaskedImageModeling(BeitPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
-        >>> model = BeitForMaskedImageModeling.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
+        >>> feature_extractor = BeitFeatureExtractor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
+        >>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -838,11 +838,11 @@ class BeitForImageClassification(BeitPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224')
-        >>> model = BeitForImageClassification.from_pretrained('microsoft/beit-base-patch16-224')
+        >>> feature_extractor = BeitFeatureExtractor.from_pretrained("microsoft/beit-base-patch16-224")
+        >>> model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -1172,11 +1172,11 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-finetuned-ade-640-640')
-        >>> model = BeitForSemanticSegmentation.from_pretrained('microsoft/beit-base-finetuned-ade-640-640')
+        >>> feature_extractor = BeitFeatureExtractor.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
+        >>> model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/beit/modeling_flax_beit.py b/src/transformers/models/beit/modeling_flax_beit.py
index 19ae6fabe2..1ba50f8758 100644
--- a/src/transformers/models/beit/modeling_flax_beit.py
+++ b/src/transformers/models/beit/modeling_flax_beit.py
@@ -735,11 +735,11 @@ FLAX_BEIT_MODEL_DOCSTRING = """
     >>> from PIL import Image
     >>> import requests
 
-    >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
     >>> image = Image.open(requests.get(url, stream=True).raw)
 
-    >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
-    >>> model = FlaxBeitModel.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
+    >>> feature_extractor = BeitFeatureExtractor.from_pretrained("microsoft/beit-base-patch16-224-pt22k-ft22k")
+    >>> model = FlaxBeitModel.from_pretrained("microsoft/beit-base-patch16-224-pt22k-ft22k")
 
     >>> inputs = feature_extractor(images=image, return_tensors="np")
     >>> outputs = model(**inputs)
@@ -822,11 +822,11 @@ FLAX_BEIT_MLM_DOCSTRING = """
     >>> from PIL import Image
     >>> import requests
 
-    >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
     >>> image = Image.open(requests.get(url, stream=True).raw)
 
-    >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
-    >>> model = BeitForMaskedImageModeling.from_pretrained('microsoft/beit-base-patch16-224-pt22k')
+    >>> feature_extractor = BeitFeatureExtractor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
+    >>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
 
     >>> inputs = feature_extractor(images=image, return_tensors="np")
     >>> outputs = model(**inputs)
@@ -906,11 +906,11 @@ FLAX_BEIT_CLASSIF_DOCSTRING = """
     >>> from PIL import Image
     >>> import requests
 
-    >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
     >>> image = Image.open(requests.get(url, stream=True).raw)
 
-    >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224')
-    >>> model = FlaxBeitForImageClassification.from_pretrained('microsoft/beit-base-patch16-224')
+    >>> feature_extractor = BeitFeatureExtractor.from_pretrained("microsoft/beit-base-patch16-224")
+    >>> model = FlaxBeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224")
 
     >>> inputs = feature_extractor(images=image, return_tensors="np")
     >>> outputs = model(**inputs)
diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py
index 7eff1dd2e8..66df84d47a 100755
--- a/src/transformers/models/bert/modeling_bert.py
+++ b/src/transformers/models/bert/modeling_bert.py
@@ -1082,8 +1082,8 @@ class BertForPreTraining(BertPreTrainedModel):
         >>> from transformers import BertTokenizer, BertForPreTraining
         >>> import torch
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        >>> model = BertForPreTraining.from_pretrained("bert-base-uncased")
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -1208,10 +1208,10 @@ class BertLMHeadModel(BertPreTrainedModel):
         >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
         >>> import torch
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
         >>> config = BertConfig.from_pretrained("bert-base-cased")
         >>> config.is_decoder = True
-        >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
+        >>> model = BertLMHeadModel.from_pretrained("bert-base-cased", config=config)
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -1436,16 +1436,16 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         >>> from transformers import BertTokenizer, BertForNextSentencePrediction
         >>> import torch
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        >>> model = BertForNextSentencePrediction.from_pretrained("bert-base-uncased")
 
         >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
         >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
 
         >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
         >>> logits = outputs.logits
-        >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
         ```
         """
 
diff --git a/src/transformers/models/bert/modeling_flax_bert.py b/src/transformers/models/bert/modeling_flax_bert.py
index 39e334ec8d..03f7309864 100644
--- a/src/transformers/models/bert/modeling_flax_bert.py
+++ b/src/transformers/models/bert/modeling_flax_bert.py
@@ -839,8 +839,8 @@ FLAX_BERT_FOR_PRETRAINING_DOCSTRING = """
     ```python
     >>> from transformers import BertTokenizer, FlaxBertForPreTraining
 
-    >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-    >>> model = FlaxBertForPreTraining.from_pretrained('bert-base-uncased')
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    >>> model = FlaxBertForPreTraining.from_pretrained("bert-base-uncased")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
     >>> outputs = model(**inputs)
@@ -985,16 +985,16 @@ FLAX_BERT_FOR_NEXT_SENT_PRED_DOCSTRING = """
     ```python
     >>> from transformers import BertTokenizer, FlaxBertForNextSentencePrediction
 
-    >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-    >>> model = FlaxBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    >>> model = FlaxBertForNextSentencePrediction.from_pretrained("bert-base-uncased")
 
     >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
     >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-    >>> encoding = tokenizer(prompt, next_sentence, return_tensors='jax')
+    >>> encoding = tokenizer(prompt, next_sentence, return_tensors="jax")
 
     >>> outputs = model(**encoding)
     >>> logits = outputs.logits
-    >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+    >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
     ```
 """
 
diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py
index 7f23abdc8c..c698cdc1ea 100644
--- a/src/transformers/models/bert/modeling_tf_bert.py
+++ b/src/transformers/models/bert/modeling_tf_bert.py
@@ -1233,9 +1233,11 @@ class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss):
         >>> import tensorflow as tf
         >>> from transformers import BertTokenizer, TFBertForPreTraining
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = TFBertForPreTraining.from_pretrained('bert-base-uncased')
-        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        >>> model = TFBertForPreTraining.from_pretrained("bert-base-uncased")
+        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[
+        ...     None, :
+        ... ]  # Batch size 1
         >>> outputs = model(input_ids)
         >>> prediction_scores, seq_relationship_scores = outputs[:2]
         ```"""
@@ -1609,15 +1611,15 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel, TFNextSentencePredi
         >>> import tensorflow as tf
         >>> from transformers import BertTokenizer, TFBertForNextSentencePrediction
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        >>> model = TFBertForNextSentencePrediction.from_pretrained("bert-base-uncased")
 
         >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
         >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="tf")
 
-        >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
-        >>> assert logits[0][0] < logits[0][1] # the next sentence was random
+        >>> logits = model(encoding["input_ids"], token_type_ids=encoding["token_type_ids"])[0]
+        >>> assert logits[0][0] < logits[0][1]  # the next sentence was random
         ```"""
         inputs = input_processing(
             func=self.call,
diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py
index 0ec8ab9705..5a910af959 100755
--- a/src/transformers/models/bert_generation/modeling_bert_generation.py
+++ b/src/transformers/models/bert_generation/modeling_bert_generation.py
@@ -513,10 +513,12 @@ class BertGenerationDecoder(BertGenerationPreTrainedModel):
         >>> from transformers import BertGenerationTokenizer, BertGenerationDecoder, BertGenerationConfig
         >>> import torch
 
-        >>> tokenizer = BertGenerationTokenizer.from_pretrained('google/bert_for_seq_generation_L-24_bbc_encoder')
+        >>> tokenizer = BertGenerationTokenizer.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
         >>> config = BertGenerationConfig.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder")
         >>> config.is_decoder = True
-        >>> model = BertGenerationDecoder.from_pretrained('google/bert_for_seq_generation_L-24_bbc_encoder', config=config)
+        >>> model = BertGenerationDecoder.from_pretrained(
+        ...     "google/bert_for_seq_generation_L-24_bbc_encoder", config=config
+        ... )
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_token_type_ids=False, return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py
index 71f4038ffe..bc1b5dab2f 100644
--- a/src/transformers/models/bertweet/tokenization_bertweet.py
+++ b/src/transformers/models/bertweet/tokenization_bertweet.py
@@ -685,6 +685,7 @@ class TweetTokenizer:
     ```python
     >>> # Tokenizer for tweets.
     >>> from nltk.tokenize import TweetTokenizer
+
     >>> tknzr = TweetTokenizer()
     >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
     >>> tknzr.tokenize(s0)
@@ -692,7 +693,7 @@ class TweetTokenizer:
 
     >>> # Examples using *strip_handles* and *reduce_len parameters*:
     >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
-    >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
+    >>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!"
     >>> tknzr.tokenize(s1)
     [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
     ```"""
diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py
index 47b9644430..5b3c62c988 100755
--- a/src/transformers/models/big_bird/modeling_big_bird.py
+++ b/src/transformers/models/big_bird/modeling_big_bird.py
@@ -2308,8 +2308,8 @@ class BigBirdForPreTraining(BigBirdPreTrainedModel):
         >>> from transformers import BigBirdTokenizer, BigBirdForPreTraining
         >>> import torch
 
-        >>> tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
-        >>> model = BigBirdForPreTraining.from_pretrained('google/bigbird-roberta-base')
+        >>> tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
+        >>> model = BigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base")
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -2532,10 +2532,10 @@ class BigBirdForCausalLM(BigBirdPreTrainedModel):
         >>> from transformers import BigBirdTokenizer, BigBirdForCausalLM, BigBirdConfig
         >>> import torch
 
-        >>> tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
+        >>> tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
         >>> config = BigBirdConfig.from_pretrained("google/bigbird-roberta-base")
         >>> config.is_decoder = True
-        >>> model = BigBirdForCausalLM.from_pretrained('google/bigbird-roberta-base', config=config)
+        >>> model = BigBirdForCausalLM.from_pretrained("google/bigbird-roberta-base", config=config)
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/big_bird/modeling_flax_big_bird.py b/src/transformers/models/big_bird/modeling_flax_big_bird.py
index 214eb9740f..d03244f404 100644
--- a/src/transformers/models/big_bird/modeling_flax_big_bird.py
+++ b/src/transformers/models/big_bird/modeling_flax_big_bird.py
@@ -1644,8 +1644,8 @@ FLAX_BIG_BIRD_FOR_PRETRAINING_DOCSTRING = """
     ```python
     >>> from transformers import BigBirdTokenizer, FlaxBigBirdForPreTraining
 
-    >>> tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
-    >>> model = FlaxBigBirdForPreTraining.from_pretrained('google/bigbird-roberta-base')
+    >>> tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
+    >>> model = FlaxBigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
     >>> outputs = model(**inputs)
diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
index 5eb5fda103..81c28f4e4d 100755
--- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
+++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
@@ -2861,7 +2861,6 @@ class BigBirdPegasusDecoderWrapper(BigBirdPegasusPreTrainedModel):
         return self.decoder(*args, **kwargs)
 
 
-# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with BartDecoderWrapper->BigBirdPegasusDecoderWrapper, BartForCausalLM->BigBirdPegasusForCausalLM, BartPreTrainedModel->BigBirdPegasusPreTrainedModel, BartTokenizer->PegasusTokenizer, 'facebook/bart-large'->"google/bigbird-pegasus-large-arxiv"
 class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel):
     def __init__(self, config):
         config = copy.deepcopy(config)
@@ -2984,7 +2983,9 @@ class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel):
         >>> from transformers import PegasusTokenizer, BigBirdPegasusForCausalLM
 
         >>> tokenizer = PegasusTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv")
-        >>> model = BigBirdPegasusForCausalLM.from_pretrained("google/bigbird-pegasus-large-arxiv", add_cross_attention=False)
+        >>> model = BigBirdPegasusForCausalLM.from_pretrained(
+        ...     "google/bigbird-pegasus-large-arxiv", add_cross_attention=False
+        ... )
         >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py
index b2e7f0b5a8..eae58ee190 100755
--- a/src/transformers/models/blenderbot/modeling_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_blenderbot.py
@@ -1130,7 +1130,9 @@ class BlenderbotModel(BlenderbotPreTrainedModel):
         >>> model = BlenderbotModel.from_pretrained("facebook/blenderbot-400M-distill")
         >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
 
-        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
         >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
         >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
 
@@ -1506,8 +1508,8 @@ class BlenderbotForCausalLM(BlenderbotPreTrainedModel):
         ```python
         >>> from transformers import BlenderbotTokenizer, BlenderbotForCausalLM
 
-        >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/bart-large')
-        >>> model = BlenderbotForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
+        >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/bart-large")
+        >>> model = BlenderbotForCausalLM.from_pretrained("facebook/bart-large", add_cross_attention=False)
         >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
index cd41a0c85f..f9f7829970 100644
--- a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
@@ -982,11 +982,11 @@ class FlaxBlenderbotPreTrainedModel(FlaxPreTrainedModel):
         ```python
         >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
 
-        >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
-        >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
+        >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill")
+        >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax")
         >>> encoder_outputs = model.encode(**inputs)
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1050,11 +1050,11 @@ class FlaxBlenderbotPreTrainedModel(FlaxPreTrainedModel):
         ```python
         >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
 
-        >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
-        >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
+        >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill")
+        >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax")
         >>> encoder_outputs = model.encode(**inputs)
 
         >>> decoder_start_token_id = model.config.decoder_start_token_id
@@ -1319,11 +1319,11 @@ class FlaxBlenderbotForConditionalGeneration(FlaxBlenderbotPreTrainedModel):
         ```python
         >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
 
-        >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
-        >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
+        >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill")
+        >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax")
         >>> encoder_outputs = model.encode(**inputs)
 
         >>> decoder_start_token_id = model.config.decoder_start_token_id
diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
index ad52947121..d180c7837d 100755
--- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
@@ -1113,7 +1113,9 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
         >>> model = BlenderbotSmallModel.from_pretrained("facebook/blenderbot_small-90M")
         >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot_small-90M")
 
-        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
         >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
         >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
 
@@ -1477,8 +1479,8 @@ class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel):
         ```python
         >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForCausalLM
 
-        >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/bart-large')
-        >>> model = BlenderbotSmallForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
+        >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/bart-large")
+        >>> model = BlenderbotSmallForCausalLM.from_pretrained("facebook/bart-large", add_cross_attention=False)
         >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
index 408589ab14..ffa1eac9d9 100644
--- a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
@@ -994,11 +994,11 @@ class FlaxBlenderbotSmallPreTrainedModel(FlaxPreTrainedModel):
         ```python
         >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
 
-        >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
-        >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
+        >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M")
+        >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot_small-90M")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="np")
         >>> encoder_outputs = model.encode(**inputs)
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1062,11 +1062,11 @@ class FlaxBlenderbotSmallPreTrainedModel(FlaxPreTrainedModel):
         ```python
         >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
 
-        >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
-        >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
+        >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M")
+        >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot_small-90M")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="np")
         >>> encoder_outputs = model.encode(**inputs)
 
         >>> decoder_start_token_id = model.config.decoder_start_token_id
@@ -1331,11 +1331,11 @@ class FlaxBlenderbotSmallForConditionalGeneration(FlaxBlenderbotSmallPreTrainedM
         ```python
         >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
 
-        >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
-        >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
+        >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M")
+        >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot_small-90M")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="np")
         >>> encoder_outputs = model.encode(**inputs)
 
         >>> decoder_start_token_id = model.config.decoder_start_token_id
diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index aabfdf5bfc..27a3cc859b 100755
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -707,11 +707,11 @@ class CLIPTextModel(CLIPPreTrainedModel):
         >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
         >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
 
-        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"],  padding=True, return_tensors="pt")
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
 
         >>> outputs = model(**inputs)
         >>> last_hidden_state = outputs.last_hidden_state
-        >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
         ```"""
         return self.text_model(
             input_ids=input_ids,
@@ -823,7 +823,7 @@ class CLIPVisionModel(CLIPPreTrainedModel):
 
         >>> outputs = model(**inputs)
         >>> last_hidden_state = outputs.last_hidden_state
-        >>> pooled_output = outputs.pooler_output # pooled CLS states
+        >>> pooled_output = outputs.pooler_output  # pooled CLS states
         ```"""
         return self.vision_model(
             pixel_values=pixel_values,
@@ -890,7 +890,7 @@ class CLIPModel(CLIPPreTrainedModel):
         >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
         >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
 
-        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"],  padding=True, return_tensors="pt")
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
         >>> text_features = model.get_text_features(**inputs)
         ```"""
         text_outputs = self.text_model(
@@ -978,11 +978,13 @@ class CLIPModel(CLIPPreTrainedModel):
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
+        >>> inputs = processor(
+        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+        ... )
 
         >>> outputs = model(**inputs)
-        >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
-        >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.return_dict
         vision_outputs = self.vision_model(
diff --git a/src/transformers/models/clip/modeling_flax_clip.py b/src/transformers/models/clip/modeling_flax_clip.py
index cbb811ab67..c7160a79ec 100644
--- a/src/transformers/models/clip/modeling_flax_clip.py
+++ b/src/transformers/models/clip/modeling_flax_clip.py
@@ -813,7 +813,7 @@ class FlaxCLIPPreTrainedModel(FlaxPreTrainedModel):
         >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
         >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
 
-        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"],  padding=True, return_tensors="np")
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")
         >>> text_features = model.get_text_features(**inputs)
         ```"""
         if position_ids is None:
@@ -943,11 +943,11 @@ FLAX_CLIP_TEXT_MODEL_DOCSTRING = """
     >>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
     >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
 
-    >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"],  padding=True, return_tensors="np")
+    >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="np")
 
     >>> outputs = model(**inputs)
     >>> last_hidden_state = outputs.last_hidden_state
-    >>> pooler_output = outputs.pooler_output # pooled (EOS token) states
+    >>> pooler_output = outputs.pooler_output  # pooled (EOS token) states
     ```
 """
 
@@ -1005,7 +1005,7 @@ FLAX_CLIP_VISION_MODEL_DOCSTRING = """
 
     >>> outputs = model(**inputs)
     >>> last_hidden_state = outputs.last_hidden_state
-    >>> pooler_output = outputs.pooler_output # pooled CLS states
+    >>> pooler_output = outputs.pooler_output  # pooled CLS states
     ```
 """
 
@@ -1128,11 +1128,13 @@ FLAX_CLIP_MODEL_DOCSTRING = """
     >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
     >>> image = Image.open(requests.get(url, stream=True).raw)
 
-    >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="np", padding=True)
+    >>> inputs = processor(
+    ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="np", padding=True
+    ... )
 
     >>> outputs = model(**inputs)
-    >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
-    >>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
+    >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+    >>> probs = jax.nn.softmax(logits_per_image, axis=1)  # we can take the softmax to get the label probabilities
     ```
 """
 
diff --git a/src/transformers/models/clip/modeling_tf_clip.py b/src/transformers/models/clip/modeling_tf_clip.py
index a32037db21..36ca1bea9e 100644
--- a/src/transformers/models/clip/modeling_tf_clip.py
+++ b/src/transformers/models/clip/modeling_tf_clip.py
@@ -1131,11 +1131,11 @@ class TFCLIPTextModel(TFCLIPPreTrainedModel):
         >>> model = TFCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
         >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
 
-        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"],  padding=True, return_tensors="tf")
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
 
         >>> outputs = model(**inputs)
         >>> last_hidden_state = outputs.last_hidden_state
-        >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
         ```"""
         inputs = input_processing(
             func=self.call,
@@ -1245,7 +1245,7 @@ class TFCLIPVisionModel(TFCLIPPreTrainedModel):
 
         >>> outputs = model(**inputs)
         >>> last_hidden_state = outputs.last_hidden_state
-        >>> pooled_output = outputs.pooler_output # pooled CLS states
+        >>> pooled_output = outputs.pooler_output  # pooled CLS states
         ```"""
         inputs = input_processing(
             func=self.call,
@@ -1355,7 +1355,7 @@ class TFCLIPModel(TFCLIPPreTrainedModel):
         >>> model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
         >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
 
-        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"],  padding=True, return_tensors="tf")
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
         >>> text_features = model.get_text_features(**inputs)
         ```"""
         inputs = input_processing(
@@ -1469,11 +1469,13 @@ class TFCLIPModel(TFCLIPPreTrainedModel):
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True)
+        >>> inputs = processor(
+        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True
+        ... )
 
         >>> outputs = model(**inputs)
-        >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
-        >>> probs = tf.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
+        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+        >>> probs = tf.nn.softmax(logits_per_image, axis=1)  # we can take the softmax to get the label probabilities
         ```"""
         inputs = input_processing(
             func=self.call,
diff --git a/src/transformers/models/convbert/configuration_convbert.py b/src/transformers/models/convbert/configuration_convbert.py
index 8f0c6fd773..0f654eebb4 100644
--- a/src/transformers/models/convbert/configuration_convbert.py
+++ b/src/transformers/models/convbert/configuration_convbert.py
@@ -78,6 +78,7 @@ class ConvBertConfig(PretrainedConfig):
 
     ```python
     >>> from transformers import ConvBertModel, ConvBertConfig
+
     >>> # Initializing a ConvBERT convbert-base-uncased style configuration
     >>> configuration = ConvBertConfig()
     >>> # Initializing a model from the convbert-base-uncased style configuration
diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py
index 1f179ed5df..364c067e87 100644
--- a/src/transformers/models/deberta/modeling_deberta.py
+++ b/src/transformers/models/deberta/modeling_deberta.py
@@ -90,10 +90,10 @@ class XSoftmax(torch.autograd.Function):
     >>> from transformers.models.deberta.modeling_deberta import XSoftmax
 
     >>> # Make a tensor
-    >>> x = torch.randn([4,20,100])
+    >>> x = torch.randn([4, 20, 100])
 
     >>> # Create a mask
-    >>> mask = (x>0).int()
+    >>> mask = (x > 0).int()
 
     >>> # Specify the dimension to apply softmax
     >>> dim = -1
diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
index be6e93e973..abe1d3e588 100644
--- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py
@@ -91,10 +91,10 @@ class XSoftmax(torch.autograd.Function):
     >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax
 
     >>> # Make a tensor
-    >>> x = torch.randn([4,20,100])
+    >>> x = torch.randn([4, 20, 100])
 
     >>> # Create a mask
-    >>> mask = (x>0).int()
+    >>> mask = (x > 0).int()
 
     >>> # Specify the dimension to apply softmax
     >>> dim = -1
diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py
index acad311b34..77d610f720 100644
--- a/src/transformers/models/deit/modeling_deit.py
+++ b/src/transformers/models/deit/modeling_deit.py
@@ -492,11 +492,11 @@ class DeiTModel(DeiTPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
-        >>> model = DeiTModel.from_pretrained('facebook/deit-base-distilled-patch16-224', add_pooling_layer=False)
+        >>> feature_extractor = DeiTFeatureExtractor.from_pretrained("facebook/deit-base-distilled-patch16-224")
+        >>> model = DeiTModel.from_pretrained("facebook/deit-base-distilled-patch16-224", add_pooling_layer=False)
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -604,13 +604,13 @@ class DeiTForImageClassification(DeiTPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
         >>> # note: we are loading a DeiTForImageClassificationWithTeacher from the hub here,
         >>> # so the head will be randomly initialized, hence the predictions will be random
-        >>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
-        >>> model = DeiTForImageClassification.from_pretrained('facebook/deit-base-distilled-patch16-224')
+        >>> feature_extractor = DeiTFeatureExtractor.from_pretrained("facebook/deit-base-distilled-patch16-224")
+        >>> model = DeiTForImageClassification.from_pretrained("facebook/deit-base-distilled-patch16-224")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -737,11 +737,11 @@ class DeiTForImageClassificationWithTeacher(DeiTPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
-        >>> model = DeiTForImageClassificationWithTeacher.from_pretrained('facebook/deit-base-distilled-patch16-224')
+        >>> feature_extractor = DeiTFeatureExtractor.from_pretrained("facebook/deit-base-distilled-patch16-224")
+        >>> model = DeiTForImageClassificationWithTeacher.from_pretrained("facebook/deit-base-distilled-patch16-224")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index e57ec6fb58..334f07d382 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -1224,11 +1224,11 @@ class DetrModel(DetrPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50')
-        >>> model = DetrModel.from_pretrained('facebook/detr-resnet-50')
+        >>> feature_extractor = DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50")
+        >>> model = DetrModel.from_pretrained("facebook/detr-resnet-50")
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
         >>> last_hidden_states = outputs.last_hidden_state
@@ -1381,11 +1381,11 @@ class DetrForObjectDetection(DetrPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50')
-        >>> model = DetrForObjectDetection.from_pretrained('facebook/detr-resnet-50')
+        >>> feature_extractor = DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50")
+        >>> model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -1537,11 +1537,11 @@ class DetrForSegmentation(DetrPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50-panoptic')
-        >>> model = DetrForSegmentation.from_pretrained('facebook/detr-resnet-50-panoptic')
+        >>> feature_extractor = DetrFeatureExtractor.from_pretrained("facebook/detr-resnet-50-panoptic")
+        >>> model = DetrForSegmentation.from_pretrained("facebook/detr-resnet-50-panoptic")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py
index 58f4d70ebc..0a1ebd9b4a 100755
--- a/src/transformers/models/distilbert/modeling_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_distilbert.py
@@ -1064,16 +1064,16 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel):
         >>> from transformers import DistilBertTokenizer, DistilBertForMultipleChoice
         >>> import torch
 
-        >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')
-        >>> model = DistilBertForMultipleChoice.from_pretrained('distilbert-base-cased')
+        >>> tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased")
+        >>> model = DistilBertForMultipleChoice.from_pretrained("distilbert-base-cased")
 
         >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
         >>> choice0 = "It is eaten with a fork and a knife."
         >>> choice1 = "It is eaten while held in the hand."
         >>> labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
 
-        >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors='pt', padding=True)
-        >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1
+        >>> encoding = tokenizer([[prompt, choice0], [prompt, choice1]], return_tensors="pt", padding=True)
+        >>> outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=labels)  # batch size is 1
 
         >>> # the linear classifier still needs to be trained
         >>> loss = outputs.loss
diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py
index aa2ccd4ce2..bcd0f49889 100644
--- a/src/transformers/models/dpr/modeling_dpr.py
+++ b/src/transformers/models/dpr/modeling_dpr.py
@@ -468,9 +468,10 @@ class DPRContextEncoder(DPRPretrainedContextEncoder):
 
         ```python
         >>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
-        >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
-        >>> model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
-        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
+
+        >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
+        >>> model = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
+        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
         >>> embeddings = model(input_ids).pooler_output
         ```"""
 
@@ -548,9 +549,10 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
 
         ```python
         >>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
-        >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
-        >>> model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
-        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
+
+        >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
+        >>> model = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
+        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="pt")["input_ids"]
         >>> embeddings = model(input_ids).pooler_output
         ```
         """
@@ -627,14 +629,15 @@ class DPRReader(DPRPretrainedReader):
 
         ```python
         >>> from transformers import DPRReader, DPRReaderTokenizer
-        >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
-        >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
+
+        >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
+        >>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
         >>> encoded_inputs = tokenizer(
-        ...         questions=["What is love ?"],
-        ...         titles=["Haddaway"],
-        ...         texts=["'What Is Love' is a song recorded by the artist Haddaway"],
-        ...         return_tensors='pt'
-        ...     )
+        ...     questions=["What is love ?"],
+        ...     titles=["Haddaway"],
+        ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+        ...     return_tensors="pt",
+        ... )
         >>> outputs = model(**encoded_inputs)
         >>> start_logits = outputs.start_logits
         >>> end_logits = outputs.end_logits
diff --git a/src/transformers/models/dpr/modeling_tf_dpr.py b/src/transformers/models/dpr/modeling_tf_dpr.py
index d48be40c38..160b97609f 100644
--- a/src/transformers/models/dpr/modeling_tf_dpr.py
+++ b/src/transformers/models/dpr/modeling_tf_dpr.py
@@ -615,9 +615,10 @@ class TFDPRContextEncoder(TFDPRPretrainedContextEncoder):
 
         ```python
         >>> from transformers import TFDPRContextEncoder, DPRContextEncoderTokenizer
-        >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
-        >>> model = TFDPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', from_pt=True)
-        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='tf')["input_ids"]
+
+        >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
+        >>> model = TFDPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base", from_pt=True)
+        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="tf")["input_ids"]
         >>> embeddings = model(input_ids).pooler_output
         ```
         """
@@ -715,9 +716,10 @@ class TFDPRQuestionEncoder(TFDPRPretrainedQuestionEncoder):
 
         ```python
         >>> from transformers import TFDPRQuestionEncoder, DPRQuestionEncoderTokenizer
-        >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
-        >>> model = TFDPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', from_pt=True)
-        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='tf')["input_ids"]
+
+        >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
+        >>> model = TFDPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base", from_pt=True)
+        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors="tf")["input_ids"]
         >>> embeddings = model(input_ids).pooler_output
         ```
         """
@@ -813,14 +815,15 @@ class TFDPRReader(TFDPRPretrainedReader):
 
         ```python
         >>> from transformers import TFDPRReader, DPRReaderTokenizer
-        >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
-        >>> model = TFDPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', from_pt=True)
+
+        >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
+        >>> model = TFDPRReader.from_pretrained("facebook/dpr-reader-single-nq-base", from_pt=True)
         >>> encoded_inputs = tokenizer(
-        ...         questions=["What is love ?"],
-        ...         titles=["Haddaway"],
-        ...         texts=["'What Is Love' is a song recorded by the artist Haddaway"],
-        ...         return_tensors='tf'
-        ...     )
+        ...     questions=["What is love ?"],
+        ...     titles=["Haddaway"],
+        ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+        ...     return_tensors="tf",
+        ... )
         >>> outputs = model(encoded_inputs)
         >>> start_logits = outputs.start_logits
         >>> end_logits = outputs.end_logits
diff --git a/src/transformers/models/dpr/tokenization_dpr.py b/src/transformers/models/dpr/tokenization_dpr.py
index d5e3c29732..9ce7b81be8 100644
--- a/src/transformers/models/dpr/tokenization_dpr.py
+++ b/src/transformers/models/dpr/tokenization_dpr.py
@@ -280,14 +280,15 @@ class CustomDPRReaderTokenizerMixin:
 
         ```python
         >>> from transformers import DPRReader, DPRReaderTokenizer
-        >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
-        >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
+
+        >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
+        >>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
         >>> encoded_inputs = tokenizer(
-        ...         questions=["What is love ?"],
-        ...         titles=["Haddaway"],
-        ...         texts=["'What Is Love' is a song recorded by the artist Haddaway"],
-        ...         return_tensors='pt'
-        ...     )
+        ...     questions=["What is love ?"],
+        ...     titles=["Haddaway"],
+        ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+        ...     return_tensors="pt",
+        ... )
         >>> outputs = model(**encoded_inputs)
         >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
         >>> print(predicted_spans[0].text)  # best span
diff --git a/src/transformers/models/dpr/tokenization_dpr_fast.py b/src/transformers/models/dpr/tokenization_dpr_fast.py
index 6be8087371..8ea21aeea1 100644
--- a/src/transformers/models/dpr/tokenization_dpr_fast.py
+++ b/src/transformers/models/dpr/tokenization_dpr_fast.py
@@ -281,14 +281,15 @@ class CustomDPRReaderTokenizerMixin:
 
         ```python
         >>> from transformers import DPRReader, DPRReaderTokenizer
-        >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
-        >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
+
+        >>> tokenizer = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
+        >>> model = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
         >>> encoded_inputs = tokenizer(
-        ...         questions=["What is love ?"],
-        ...         titles=["Haddaway"],
-        ...         texts=["'What Is Love' is a song recorded by the artist Haddaway"],
-        ...         return_tensors='pt'
-        ...     )
+        ...     questions=["What is love ?"],
+        ...     titles=["Haddaway"],
+        ...     texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+        ...     return_tensors="pt",
+        ... )
         >>> outputs = model(**encoded_inputs)
         >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
         >>> print(predicted_spans[0].text)  # best span
diff --git a/src/transformers/models/electra/modeling_electra.py b/src/transformers/models/electra/modeling_electra.py
index 3f00111518..8ad939d5f6 100644
--- a/src/transformers/models/electra/modeling_electra.py
+++ b/src/transformers/models/electra/modeling_electra.py
@@ -1095,10 +1095,12 @@ class ElectraForPreTraining(ElectraPreTrainedModel):
         >>> from transformers import ElectraTokenizer, ElectraForPreTraining
         >>> import torch
 
-        >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        >>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
+        >>> tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
+        >>> model = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
 
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
+        ...     0
+        ... )  # Batch size 1
         >>> logits = model(input_ids).logits
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/electra/modeling_flax_electra.py b/src/transformers/models/electra/modeling_flax_electra.py
index 8ef3f270e0..7f277ede35 100644
--- a/src/transformers/models/electra/modeling_flax_electra.py
+++ b/src/transformers/models/electra/modeling_flax_electra.py
@@ -821,8 +821,8 @@ FLAX_ELECTRA_FOR_PRETRAINING_DOCSTRING = """
     ```python
     >>> from transformers import ElectraTokenizer, FlaxElectraForPreTraining
 
-    >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-    >>> model = FlaxElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
+    >>> tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
+    >>> model = FlaxElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
 
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
     >>> outputs = model(**inputs)
diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py
index 8e734e24e9..c00bb3c190 100644
--- a/src/transformers/models/electra/modeling_tf_electra.py
+++ b/src/transformers/models/electra/modeling_tf_electra.py
@@ -1088,8 +1088,8 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel):
         >>> import tensorflow as tf
         >>> from transformers import ElectraTokenizer, TFElectraForPreTraining
 
-        >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        >>> model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
+        >>> tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
+        >>> model = TFElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
         >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
         >>> outputs = model(input_ids)
         >>> scores = outputs[0]
diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
index 833ff5d759..1fca8a10f7 100644
--- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
@@ -57,17 +57,17 @@ class EncoderDecoderConfig(PretrainedConfig):
 
     >>> # Accessing the model configuration
     >>> config_encoder = model.config.encoder
-    >>> config_decoder  = model.config.decoder
+    >>> config_decoder = model.config.decoder
     >>> # set decoder config to causal lm
     >>> config_decoder.is_decoder = True
     >>> config_decoder.add_cross_attention = True
 
     >>> # Saving the model, including its configuration
-    >>> model.save_pretrained('my-model')
+    >>> model.save_pretrained("my-model")
 
     >>> # loading model and config from pretrained folder
-    >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
-    >>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
+    >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained("my-model")
+    >>> model = EncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
     ```"""
     model_type = "encoder-decoder"
     is_composition = True
diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
index 7ec7aa8c59..f829773ca0 100644
--- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
@@ -336,8 +336,9 @@ class EncoderDecoderModel(PreTrainedModel):
 
         ```python
         >>> from transformers import EncoderDecoderModel
+
         >>> # initialize a bert2bert from two pretrained BERT models. Note that the cross-attention layers will be randomly initialized
-        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')
+        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./bert2bert")
         >>> # load fine-tuned model
@@ -448,8 +449,10 @@ class EncoderDecoderModel(PreTrainedModel):
         >>> from transformers import EncoderDecoderModel, BertTokenizer
         >>> import torch
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert from pre-trained checkpoints
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "bert-base-uncased", "bert-base-uncased"
+        ... )  # initialize Bert2Bert from pre-trained checkpoints
 
         >>> # training
         >>> model.config.decoder_start_token_id = tokenizer.cls_token_id
diff --git a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
index 66e8a9ffa6..2578984835 100644
--- a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
@@ -431,12 +431,12 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
         >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
 
         >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
+        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> input_ids = tokenizer.encode(text, return_tensors='np')
+        >>> input_ids = tokenizer.encode(text, return_tensors="np")
         >>> encoder_outputs = model.encode(input_ids)
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -509,12 +509,12 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
         >>> import jax.numpy as jnp
 
         >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
+        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> input_ids = tokenizer.encode(text, max_length=1024, return_tensors='np')
+        >>> input_ids = tokenizer.encode(text, max_length=1024, return_tensors="np")
         >>> encoder_outputs = model.encode(input_ids)
 
         >>> decoder_start_token_id = model.config.decoder.bos_token_id
@@ -636,15 +636,15 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
         >>> # load a fine-tuned bert2gpt2 model
         >>> model = FlaxEncoderDecoderModel.from_pretrained("patrickvonplaten/bert2gpt2-cnn_dailymail-fp16")
         >>> # load input & output tokenizer
-        >>> tokenizer_input = BertTokenizer.from_pretrained('bert-base-cased')
-        >>> tokenizer_output = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> tokenizer_input = BertTokenizer.from_pretrained("bert-base-cased")
+        >>> tokenizer_output = GPT2Tokenizer.from_pretrained("gpt2")
 
         >>> article = '''Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members
-        ... singing a racist chant. SAE's national chapter suspended the students,
-        ... but University of Oklahoma President David Boren took it a step further,
-        ... saying the university's affiliation with the fraternity is permanently done.'''
+        ... singing a racist chant. SAE's national chapter suspended the students,
+        ... but University of Oklahoma President David Boren took it a step further,
+        ... saying the university's affiliation with the fraternity is permanently done.'''
 
-        >>> input_ids = tokenizer_input(article, add_special_tokens=True, return_tensors='np').input_ids
+        >>> input_ids = tokenizer_input(article, add_special_tokens=True, return_tensors="np").input_ids
 
         >>> # use GPT2's eos_token as the pad as well as eos token
         >>> model.config.eos_token_id = model.config.decoder.eos_token_id
@@ -654,7 +654,8 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
 
         >>> summary = tokenizer_output.batch_decode(sequences, skip_special_tokens=True)[0]
         >>> assert summary == "SAS Alpha Epsilon suspended Sigma Alpha Epsilon members"
-        ```"""
+        ```
+        """
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -781,8 +782,9 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
 
         ```python
         >>> from transformers import FlaxEncoderDecoderModel
+
         >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
+        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./bert2gpt2")
         >>> # load fine-tuned model
diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
index 25fc6e38a3..8c725b05cc 100644
--- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
@@ -280,6 +280,7 @@ class TFEncoderDecoderModel(TFPreTrainedModel):
 
         ```python
         >>> from transformers import TFEncoderDecoderModel
+
         >>> model = TFEncoderDecoderModel.from_pretrained("ydshieh/bert2bert-cnn_dailymail-fp16")
         ```"""
 
@@ -347,8 +348,9 @@ class TFEncoderDecoderModel(TFPreTrainedModel):
 
         ```python
         >>> from transformers import TFEncoderDecoderModel
+
         >>> # initialize a bert2gpt2 from two pretrained BERT models. Note that the cross-attention layers will be randomly initialized
-        >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'gpt2')
+        >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "gpt2")
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./bert2gpt2")
         >>> # load fine-tuned model
@@ -486,12 +488,14 @@ class TFEncoderDecoderModel(TFPreTrainedModel):
         >>> from transformers import TFEncoderDecoderModel, BertTokenizer
 
         >>> # initialize a bert2gpt2 from a pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-        >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
+        >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-cased", "gpt2")
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
 
         >>> # forward
-        >>> input_ids = tokenizer.encode("Hello, my dog is cute", add_special_tokens=True, return_tensors='tf')  # Batch size 1
+        >>> input_ids = tokenizer.encode(
+        ...     "Hello, my dog is cute", add_special_tokens=True, return_tensors="tf"
+        ... )  # Batch size 1
         >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
 
         >>> # training
diff --git a/src/transformers/models/fnet/modeling_fnet.py b/src/transformers/models/fnet/modeling_fnet.py
index f76205558c..f12281499b 100755
--- a/src/transformers/models/fnet/modeling_fnet.py
+++ b/src/transformers/models/fnet/modeling_fnet.py
@@ -674,8 +674,9 @@ class FNetForPreTraining(FNetPreTrainedModel):
         ```python
         >>> from transformers import FNetTokenizer, FNetForPreTraining
         >>> import torch
-        >>> tokenizer = FNetTokenizer.from_pretrained('google/fnet-base')
-        >>> model = FNetForPreTraining.from_pretrained('google/fnet-base')
+
+        >>> tokenizer = FNetTokenizer.from_pretrained("google/fnet-base")
+        >>> model = FNetForPreTraining.from_pretrained("google/fnet-base")
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
         >>> prediction_logits = outputs.prediction_logits
@@ -822,14 +823,15 @@ class FNetForNextSentencePrediction(FNetPreTrainedModel):
         ```python
         >>> from transformers import FNetTokenizer, FNetForNextSentencePrediction
         >>> import torch
-        >>> tokenizer = FNetTokenizer.from_pretrained('google/fnet-base')
-        >>> model = FNetForNextSentencePrediction.from_pretrained('google/fnet-base')
+
+        >>> tokenizer = FNetTokenizer.from_pretrained("google/fnet-base")
+        >>> model = FNetForNextSentencePrediction.from_pretrained("google/fnet-base")
         >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
         >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
         >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
         >>> logits = outputs.logits
-        >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
         ```"""
 
         if "next_sentence_label" in kwargs:
diff --git a/src/transformers/models/fnet/tokenization_fnet.py b/src/transformers/models/fnet/tokenization_fnet.py
index 97e0865571..209e4f5229 100644
--- a/src/transformers/models/fnet/tokenization_fnet.py
+++ b/src/transformers/models/fnet/tokenization_fnet.py
@@ -275,7 +275,7 @@ class FNetTokenizer(PreTrainedTokenizer):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. An FNet sequence
         pair mask has the following format: :
 
-        ```python
+        ```
         0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence |
         ```
 
diff --git a/src/transformers/models/fsmt/configuration_fsmt.py b/src/transformers/models/fsmt/configuration_fsmt.py
index 06939165ce..290b75af5e 100644
--- a/src/transformers/models/fsmt/configuration_fsmt.py
+++ b/src/transformers/models/fsmt/configuration_fsmt.py
@@ -120,7 +120,7 @@ class FSMTConfig(PretrainedConfig):
     ```python
     >>> from transformers import FSMTConfig, FSMTModel
 
-    >>> config = FSMTConfig.from_pretrained('facebook/wmt19-en-ru')
+    >>> config = FSMTConfig.from_pretrained("facebook/wmt19-en-ru")
     >>> model = FSMTModel(config)
     ```"""
     model_type = "fsmt"
diff --git a/src/transformers/models/funnel/modeling_funnel.py b/src/transformers/models/funnel/modeling_funnel.py
index 20c3aa1082..c1db54dd7e 100644
--- a/src/transformers/models/funnel/modeling_funnel.py
+++ b/src/transformers/models/funnel/modeling_funnel.py
@@ -1114,10 +1114,10 @@ class FunnelForPreTraining(FunnelPreTrainedModel):
         >>> from transformers import FunnelTokenizer, FunnelForPreTraining
         >>> import torch
 
-        >>> tokenizer = FunnelTokenizer.from_pretrained('funnel-transformer/small')
-        >>> model = FunnelForPreTraining.from_pretrained('funnel-transformer/small')
+        >>> tokenizer = FunnelTokenizer.from_pretrained("funnel-transformer/small")
+        >>> model = FunnelForPreTraining.from_pretrained("funnel-transformer/small")
 
-        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors= "pt")
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> logits = model(**inputs).logits
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py
index 373f8e192e..c79bab4047 100644
--- a/src/transformers/models/funnel/modeling_tf_funnel.py
+++ b/src/transformers/models/funnel/modeling_tf_funnel.py
@@ -1273,10 +1273,10 @@ class TFFunnelForPreTraining(TFFunnelPreTrainedModel):
         >>> from transformers import FunnelTokenizer, TFFunnelForPreTraining
         >>> import torch
 
-        >>> tokenizer = TFFunnelTokenizer.from_pretrained('funnel-transformer/small')
-        >>> model = TFFunnelForPreTraining.from_pretrained('funnel-transformer/small')
+        >>> tokenizer = FunnelTokenizer.from_pretrained("funnel-transformer/small")
+        >>> model = TFFunnelForPreTraining.from_pretrained("funnel-transformer/small")
 
-        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors= "tf")
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
         >>> logits = model(inputs).logits
         ```"""
         inputs = input_processing(
diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index 7a4a27c7ed..8bb8590a8b 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -631,12 +631,13 @@ PARALLELIZE_DOCSTRING = r"""
 
     ```python
     # Here is an example of a device map on a machine with 4 GPUs using gpt2-xl, which has a total of 48 attention modules:
-    model = GPT2LMHeadModel.from_pretrained('gpt2-xl')
-    device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
-
-              1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
-              2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
-              3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]}
+    model = GPT2LMHeadModel.from_pretrained("gpt2-xl")
+    device_map = {
+        0: [0, 1, 2, 3, 4, 5, 6, 7, 8],
+        1: [9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],
+        2: [22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34],
+        3: [35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47],
+    }
     model.parallelize(device_map)
     ```
 """
@@ -647,14 +648,15 @@ DEPARALLELIZE_DOCSTRING = r"""
 
     ```python
     # On a 4 GPU machine with gpt2-large:
-    model = GPT2LMHeadModel.from_pretrained('gpt2-large')
-    device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7],
-
-                1: [8, 9, 10, 11, 12, 13, 14, 15],
-                2: [16, 17, 18, 19, 20, 21, 22, 23],
-                3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]}
-    model.parallelize(device_map) # Splits the model across several devices
-    model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+    model = GPT2LMHeadModel.from_pretrained("gpt2-large")
+    device_map = {
+        0: [0, 1, 2, 3, 4, 5, 6, 7],
+        1: [8, 9, 10, 11, 12, 13, 14, 15],
+        2: [16, 17, 18, 19, 20, 21, 22, 23],
+        3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35],
+    }
+    model.parallelize(device_map)  # Splits the model across several devices
+    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
     ```
 """
 
@@ -1224,13 +1226,15 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
         >>> import torch
         >>> from transformers import GPT2Tokenizer, GPT2DoubleHeadsModel
 
-        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
+        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        >>> model = GPT2DoubleHeadsModel.from_pretrained("gpt2")
 
         >>> # Add a [CLS] to the vocabulary (we should train it also!)
-        >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
+        >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
 
-        >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
+        >>> embedding_layer = model.resize_token_embeddings(
+        ...     len(tokenizer)
+        ... )  # Update the model embeddings with the new vocabulary size
 
         >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
         >>> encoded_choices = [tokenizer.encode(s) for s in choices]
diff --git a/src/transformers/models/gpt2/modeling_tf_gpt2.py b/src/transformers/models/gpt2/modeling_tf_gpt2.py
index 1bac2f6226..637ec12b8f 100644
--- a/src/transformers/models/gpt2/modeling_tf_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_tf_gpt2.py
@@ -1033,13 +1033,15 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
         >>> import tensorflow as tf
         >>> from transformers import GPT2Tokenizer, TFGPT2DoubleHeadsModel
 
-        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        >>> model = TFGPT2DoubleHeadsModel.from_pretrained('gpt2')
+        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        >>> model = TFGPT2DoubleHeadsModel.from_pretrained("gpt2")
 
         >>> # Add a [CLS] to the vocabulary (we should train it also!)
-        >>> num_added_tokens = tokenizer.add_special_tokens({'cls_token': '[CLS]'})
+        >>> num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
 
-        >>> embedding_layer = model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
+        >>> embedding_layer = model.resize_token_embeddings(
+        ...     len(tokenizer)
+        ... )  # Update the model embeddings with the new vocabulary size
 
         >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
         >>> encoded_choices = [tokenizer.encode(s) for s in choices]
diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py
index 7eafeab20e..869014bee6 100755
--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -412,11 +412,13 @@ PARALLELIZE_DOCSTRING = r"""
 
     ```python
     # Here is an example of a device map on a machine with 4 GPUs using gpt-j-6B, which has a total of 28 attention modules:
-    model = GPTJForCausalLM.from_pretrained('EleutherAI/gpt-j-6B')
-    device_map = {0: [0, 1, 2, 3, 4, 5, 6],
-              1: [7, 8, 9, 10, 11, 12, 13],
-              2: [14, 15, 16, 17, 18, 19, 20],
-              3: [21, 22, 23, 24, 25, 26, 27]}
+    model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
+    device_map = {
+        0: [0, 1, 2, 3, 4, 5, 6],
+        1: [7, 8, 9, 10, 11, 12, 13],
+        2: [14, 15, 16, 17, 18, 19, 20],
+        3: [21, 22, 23, 24, 25, 26, 27],
+    }
     model.parallelize(device_map)
     ```
 """
@@ -428,13 +430,15 @@ DEPARALLELIZE_DOCSTRING = r"""
 
     ```python
     # On a 4 GPU machine with gpt-j-6B:
-    model = GPTJForCausalLM.from_pretrained('EleutherAI/gpt-j-6B')
-    device_map = {0: [0, 1, 2, 3, 4, 5, 6],
-                  1: [7, 8, 9, 10, 11, 12, 13],
-                  2: [14, 15, 16, 17, 18, 19, 20],
-                  3: [21, 22, 23, 24, 25, 26, 27]}
-    model.parallelize(device_map) # Splits the model across several devices
-    model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+    model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
+    device_map = {
+        0: [0, 1, 2, 3, 4, 5, 6],
+        1: [7, 8, 9, 10, 11, 12, 13],
+        2: [14, 15, 16, 17, 18, 19, 20],
+        3: [21, 22, 23, 24, 25, 26, 27],
+    }
+    model.parallelize(device_map)  # Splits the model across several devices
+    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
     ```
 """
 
diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py
index f983875c74..eaf3f4b697 100755
--- a/src/transformers/models/hubert/modeling_hubert.py
+++ b/src/transformers/models/hubert/modeling_hubert.py
@@ -987,11 +987,13 @@ class HubertModel(HubertPreTrainedModel):
         >>> processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
         >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
 
+
         >>> def map_to_array(batch):
         ...     speech, _ = sf.read(batch["file"])
         ...     batch["speech"] = speech
         ...     return batch
 
+
         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> ds = ds.map(map_to_array)
 
diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py
index 976135dd7e..8cbd2c6fdf 100644
--- a/src/transformers/models/hubert/modeling_tf_hubert.py
+++ b/src/transformers/models/hubert/modeling_tf_hubert.py
@@ -1417,11 +1417,13 @@ class TFHubertModel(TFHubertPreTrainedModel):
         >>> processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-base-960h")
         >>> model = TFHubertModel.from_pretrained("facebook/hubert-base-960h")
 
+
         >>> def map_to_array(batch):
         ...     speech, _ = sf.read(batch["file"])
         ...     batch["speech"] = speech
         ...     return batch
 
+
         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> ds = ds.map(map_to_array)
 
@@ -1528,16 +1530,19 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
         >>> processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-base-960h")
         >>> model = TFHubertForCTC.from_pretrained("facebook/hubert-base-960h")
 
+
         >>> def map_to_array(batch):
         ...     speech, _ = sf.read(batch["file"])
         ...     batch["speech"] = speech
         ...     return batch
 
+
         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> ds = ds.map(map_to_array)
 
-        >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1
-        >>> logits = model(input_values).logits >>> predicted_ids = tf.argmax(logits, axis=-1)
+        >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
+        >>> logits = model(input_values).logits
+        >>> predicted_ids = tf.argmax(logits, axis=-1)
 
         >>> transcription = processor.decode(predicted_ids[0])
 
diff --git a/src/transformers/models/imagegpt/modeling_imagegpt.py b/src/transformers/models/imagegpt/modeling_imagegpt.py
index 563e1b0279..38917ae377 100755
--- a/src/transformers/models/imagegpt/modeling_imagegpt.py
+++ b/src/transformers/models/imagegpt/modeling_imagegpt.py
@@ -686,11 +686,11 @@ class ImageGPTModel(ImageGPTPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = ImageGPTFeatureExtractor.from_pretrained('openai/imagegpt-small')
-        >>> model = ImageGPTModel.from_pretrained('openai/imagegpt-small')
+        >>> feature_extractor = ImageGPTFeatureExtractor.from_pretrained("openai/imagegpt-small")
+        >>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -981,27 +981,31 @@ class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel):
         >>> import matplotlib.pyplot as plt
         >>> import numpy as np
 
-        >>> feature_extractor = ImageGPTFeatureExtractor.from_pretrained('openai/imagegpt-small')
-        >>> model = ImageGPTForCausalImageModeling.from_pretrained('openai/imagegpt-small')
+        >>> feature_extractor = ImageGPTFeatureExtractor.from_pretrained("openai/imagegpt-small")
+        >>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
         >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         >>> model.to(device)
 
         >>> # unconditional generation of 8 images
         >>> batch_size = 8
-        >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1) #initialize with SOS token
+        >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1)  # initialize with SOS token
         >>> context = torch.tensor(context).to(device)
-        >>> output = model.generate(input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40)
+        >>> output = model.generate(
+        ...     input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
+        ... )
 
         >>> clusters = feature_extractor.clusters
         >>> n_px = feature_extractor.size
 
-        >>> samples = output[:,1:].cpu().detach().numpy()
-        >>> samples_img = [np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [n_px, n_px, 3]).astype(np.uint8) for s in samples] # convert color cluster tokens back to pixels
+        >>> samples = output[:, 1:].cpu().detach().numpy()
+        >>> samples_img = [
+        ...     np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [n_px, n_px, 3]).astype(np.uint8) for s in samples
+        ... ]  # convert color cluster tokens back to pixels
         >>> f, axes = plt.subplots(1, batch_size, dpi=300)
 
         >>> for img, ax in zip(samples_img, axes):
-        ...    ax.axis('off')
-        ...    ax.imshow(img)
+        ...     ax.axis("off")
+        ...     ax.imshow(img)
         ```"""
 
         if "pixel_values" in kwargs:
@@ -1126,11 +1130,11 @@ class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = ImageGPTFeatureExtractor.from_pretrained('openai/imagegpt-small')
-        >>> model = ImageGPTForImageClassification.from_pretrained('openai/imagegpt-small')
+        >>> feature_extractor = ImageGPTFeatureExtractor.from_pretrained("openai/imagegpt-small")
+        >>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py
index c4579d05b2..29a8c071ee 100644
--- a/src/transformers/models/layoutlm/modeling_layoutlm.py
+++ b/src/transformers/models/layoutlm/modeling_layoutlm.py
@@ -755,8 +755,8 @@ class LayoutLMModel(LayoutLMPreTrainedModel):
         >>> from transformers import LayoutLMTokenizer, LayoutLMModel
         >>> import torch
 
-        >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
-        >>> model = LayoutLMModel.from_pretrained('microsoft/layoutlm-base-uncased')
+        >>> tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = LayoutLMModel.from_pretrained("microsoft/layoutlm-base-uncased")
 
         >>> words = ["Hello", "world"]
         >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
@@ -768,13 +768,15 @@ class LayoutLMModel(LayoutLMPreTrainedModel):
         >>> # add bounding boxes of cls + sep tokens
         >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
 
-        >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
+        >>> encoding = tokenizer(" ".join(words), return_tensors="pt")
         >>> input_ids = encoding["input_ids"]
         >>> attention_mask = encoding["attention_mask"]
         >>> token_type_ids = encoding["token_type_ids"]
         >>> bbox = torch.tensor([token_boxes])
 
-        >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
+        >>> outputs = model(
+        ...     input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids
+        ... )
 
         >>> last_hidden_states = outputs.last_hidden_state
         ```"""
@@ -900,8 +902,8 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
         >>> from transformers import LayoutLMTokenizer, LayoutLMForMaskedLM
         >>> import torch
 
-        >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
-        >>> model = LayoutLMForMaskedLM.from_pretrained('microsoft/layoutlm-base-uncased')
+        >>> tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = LayoutLMForMaskedLM.from_pretrained("microsoft/layoutlm-base-uncased")
 
         >>> words = ["Hello", "[MASK]"]
         >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
@@ -913,7 +915,7 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
         >>> # add bounding boxes of cls + sep tokens
         >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
 
-        >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
+        >>> encoding = tokenizer(" ".join(words), return_tensors="pt")
         >>> input_ids = encoding["input_ids"]
         >>> attention_mask = encoding["attention_mask"]
         >>> token_type_ids = encoding["token_type_ids"]
@@ -921,8 +923,13 @@ class LayoutLMForMaskedLM(LayoutLMPreTrainedModel):
 
         >>> labels = tokenizer("Hello world", return_tensors="pt")["input_ids"]
 
-        >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
-        ...                 labels=labels)
+        >>> outputs = model(
+        ...     input_ids=input_ids,
+        ...     bbox=bbox,
+        ...     attention_mask=attention_mask,
+        ...     token_type_ids=token_type_ids,
+        ...     labels=labels,
+        ... )
 
         >>> loss = outputs.loss
         ```"""
@@ -1017,8 +1024,8 @@ class LayoutLMForSequenceClassification(LayoutLMPreTrainedModel):
         >>> from transformers import LayoutLMTokenizer, LayoutLMForSequenceClassification
         >>> import torch
 
-        >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
-        >>> model = LayoutLMForSequenceClassification.from_pretrained('microsoft/layoutlm-base-uncased')
+        >>> tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = LayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased")
 
         >>> words = ["Hello", "world"]
         >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
@@ -1030,15 +1037,20 @@ class LayoutLMForSequenceClassification(LayoutLMPreTrainedModel):
         >>> # add bounding boxes of cls + sep tokens
         >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
 
-        >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
+        >>> encoding = tokenizer(" ".join(words), return_tensors="pt")
         >>> input_ids = encoding["input_ids"]
         >>> attention_mask = encoding["attention_mask"]
         >>> token_type_ids = encoding["token_type_ids"]
         >>> bbox = torch.tensor([token_boxes])
         >>> sequence_label = torch.tensor([1])
 
-        >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
-        ...                 labels=sequence_label)
+        >>> outputs = model(
+        ...     input_ids=input_ids,
+        ...     bbox=bbox,
+        ...     attention_mask=attention_mask,
+        ...     token_type_ids=token_type_ids,
+        ...     labels=sequence_label,
+        ... )
 
         >>> loss = outputs.loss
         >>> logits = outputs.logits
@@ -1147,8 +1159,8 @@ class LayoutLMForTokenClassification(LayoutLMPreTrainedModel):
         >>> from transformers import LayoutLMTokenizer, LayoutLMForTokenClassification
         >>> import torch
 
-        >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
-        >>> model = LayoutLMForTokenClassification.from_pretrained('microsoft/layoutlm-base-uncased')
+        >>> tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = LayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased")
 
         >>> words = ["Hello", "world"]
         >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
@@ -1160,15 +1172,20 @@ class LayoutLMForTokenClassification(LayoutLMPreTrainedModel):
         >>> # add bounding boxes of cls + sep tokens
         >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
 
-        >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
+        >>> encoding = tokenizer(" ".join(words), return_tensors="pt")
         >>> input_ids = encoding["input_ids"]
         >>> attention_mask = encoding["attention_mask"]
         >>> token_type_ids = encoding["token_type_ids"]
         >>> bbox = torch.tensor([token_boxes])
-        >>> token_labels = torch.tensor([1,1,0,0]).unsqueeze(0) # batch size of 1
+        >>> token_labels = torch.tensor([1, 1, 0, 0]).unsqueeze(0)  # batch size of 1
 
-        >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
-        ...                 labels=token_labels)
+        >>> outputs = model(
+        ...     input_ids=input_ids,
+        ...     bbox=bbox,
+        ...     attention_mask=attention_mask,
+        ...     token_type_ids=token_type_ids,
+        ...     labels=token_labels,
+        ... )
 
         >>> loss = outputs.loss
         >>> logits = outputs.logits
diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
index ae4ce05c0e..6ac317f391 100644
--- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
+++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
@@ -954,8 +954,8 @@ class TFLayoutLMModel(TFLayoutLMPreTrainedModel):
         >>> from transformers import LayoutLMTokenizer, TFLayoutLMModel
         >>> import tensorflow as tf
 
-        >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
-        >>> model = TFLayoutLMModel.from_pretrained('microsoft/layoutlm-base-uncased')
+        >>> tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = TFLayoutLMModel.from_pretrained("microsoft/layoutlm-base-uncased")
 
         >>> words = ["Hello", "world"]
         >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
@@ -967,13 +967,15 @@ class TFLayoutLMModel(TFLayoutLMPreTrainedModel):
         >>> # add bounding boxes of cls + sep tokens
         >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
 
-        >>> encoding = tokenizer(' '.join(words), return_tensors="tf")
+        >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
         >>> input_ids = encoding["input_ids"]
         >>> attention_mask = encoding["attention_mask"]
         >>> token_type_ids = encoding["token_type_ids"]
         >>> bbox = tf.convert_to_tensor([token_boxes])
 
-        >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
+        >>> outputs = model(
+        ...     input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids
+        ... )
 
         >>> last_hidden_states = outputs.last_hidden_state
         ```"""
@@ -1094,8 +1096,8 @@ class TFLayoutLMForMaskedLM(TFLayoutLMPreTrainedModel, TFMaskedLanguageModelingL
         >>> from transformers import LayoutLMTokenizer, TFLayoutLMForMaskedLM
         >>> import tensorflow as tf
 
-        >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
-        >>> model = TFLayoutLMForMaskedLM.from_pretrained('microsoft/layoutlm-base-uncased')
+        >>> tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = TFLayoutLMForMaskedLM.from_pretrained("microsoft/layoutlm-base-uncased")
 
         >>> words = ["Hello", "[MASK]"]
         >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
@@ -1107,7 +1109,7 @@ class TFLayoutLMForMaskedLM(TFLayoutLMPreTrainedModel, TFMaskedLanguageModelingL
         >>> # add bounding boxes of cls + sep tokens
         >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
 
-        >>> encoding = tokenizer(' '.join(words), return_tensors="tf")
+        >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
         >>> input_ids = encoding["input_ids"]
         >>> attention_mask = encoding["attention_mask"]
         >>> token_type_ids = encoding["token_type_ids"]
@@ -1115,8 +1117,13 @@ class TFLayoutLMForMaskedLM(TFLayoutLMPreTrainedModel, TFMaskedLanguageModelingL
 
         >>> labels = tokenizer("Hello world", return_tensors="tf")["input_ids"]
 
-        >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
-        ...                 labels=labels)
+        >>> outputs = model(
+        ...     input_ids=input_ids,
+        ...     bbox=bbox,
+        ...     attention_mask=attention_mask,
+        ...     token_type_ids=token_type_ids,
+        ...     labels=labels,
+        ... )
 
         >>> loss = outputs.loss
         ```"""
@@ -1231,8 +1238,8 @@ class TFLayoutLMForSequenceClassification(TFLayoutLMPreTrainedModel, TFSequenceC
         >>> from transformers import LayoutLMTokenizer, TFLayoutLMForSequenceClassification
         >>> import tensorflow as tf
 
-        >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
-        >>> model = TFLayoutLMForSequenceClassification.from_pretrained('microsoft/layoutlm-base-uncased')
+        >>> tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = TFLayoutLMForSequenceClassification.from_pretrained("microsoft/layoutlm-base-uncased")
 
         >>> words = ["Hello", "world"]
         >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
@@ -1244,15 +1251,20 @@ class TFLayoutLMForSequenceClassification(TFLayoutLMPreTrainedModel, TFSequenceC
         >>> # add bounding boxes of cls + sep tokens
         >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
 
-        >>> encoding = tokenizer(' '.join(words), return_tensors="tf")
+        >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
         >>> input_ids = encoding["input_ids"]
         >>> attention_mask = encoding["attention_mask"]
         >>> token_type_ids = encoding["token_type_ids"]
         >>> bbox = tf.convert_to_tensor([token_boxes])
         >>> sequence_label = tf.convert_to_tensor([1])
 
-        >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
-        ...                 labels=sequence_label)
+        >>> outputs = model(
+        ...     input_ids=input_ids,
+        ...     bbox=bbox,
+        ...     attention_mask=attention_mask,
+        ...     token_type_ids=token_type_ids,
+        ...     labels=sequence_label,
+        ... )
 
         >>> loss = outputs.loss
         >>> logits = outputs.logits
@@ -1371,8 +1383,8 @@ class TFLayoutLMForTokenClassification(TFLayoutLMPreTrainedModel, TFTokenClassif
         >>> from transformers import LayoutLMTokenizer, TFLayoutLMForTokenClassification
         >>> import torch
 
-        >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
-        >>> model = TFLayoutLMForTokenClassification.from_pretrained('microsoft/layoutlm-base-uncased')
+        >>> tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
+        >>> model = TFLayoutLMForTokenClassification.from_pretrained("microsoft/layoutlm-base-uncased")
 
         >>> words = ["Hello", "world"]
         >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
@@ -1384,15 +1396,20 @@ class TFLayoutLMForTokenClassification(TFLayoutLMPreTrainedModel, TFTokenClassif
         >>> # add bounding boxes of cls + sep tokens
         >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
 
-        >>> encoding = tokenizer(' '.join(words), return_tensors="tf")
+        >>> encoding = tokenizer(" ".join(words), return_tensors="tf")
         >>> input_ids = encoding["input_ids"]
         >>> attention_mask = encoding["attention_mask"]
         >>> token_type_ids = encoding["token_type_ids"]
         >>> bbox = tf.convert_to_tensor([token_boxes])
-        >>> token_labels = tf.convert_to_tensor([1,1,0,0])
+        >>> token_labels = tf.convert_to_tensor([1, 1, 0, 0])
 
-        >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids,
-        ...                 labels=token_labels)
+        >>> outputs = model(
+        ...     input_ids=input_ids,
+        ...     bbox=bbox,
+        ...     attention_mask=attention_mask,
+        ...     token_type_ids=token_type_ids,
+        ...     labels=token_labels,
+        ... )
 
         >>> loss = outputs.loss
         >>> logits = outputs.logits
diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
index 5162b9c22c..8c5d95b76f 100755
--- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
@@ -827,8 +827,8 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
         >>> from transformers import LayoutLMv2Processor, LayoutLMv2Model
         >>> from PIL import Image
 
-        >>> processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased')
-        >>> model = LayoutLMv2Model.from_pretrained('microsoft/layoutlmv2-base-uncased')
+        >>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+        >>> model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased")
 
         >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
 
@@ -995,8 +995,8 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):
         >>> from PIL import Image
         >>> import torch
 
-        >>> processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased')
-        >>> model = LayoutLMv2ForSequenceClassification.from_pretrained('microsoft/layoutlmv2-base-uncased')
+        >>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+        >>> model = LayoutLMv2ForSequenceClassification.from_pretrained("microsoft/layoutlmv2-base-uncased")
 
         >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
 
@@ -1161,12 +1161,12 @@ class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):
         >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
         >>> from PIL import Image
 
-        >>> processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased', revision="no_ocr")
-        >>> model = LayoutLMv2ForTokenClassification.from_pretrained('microsoft/layoutlmv2-base-uncased')
+        >>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
+        >>> model = LayoutLMv2ForTokenClassification.from_pretrained("microsoft/layoutlmv2-base-uncased")
 
         >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
         >>> words = ["hello", "world"]
-        >>> boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+        >>> boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]  # make sure to normalize your bounding boxes
         >>> word_labels = [0, 1]
 
         >>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
@@ -1285,8 +1285,8 @@ class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel):
         >>> from PIL import Image
         >>> import torch
 
-        >>> processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased')
-        >>> model = LayoutLMv2ForQuestionAnswering.from_pretrained('microsoft/layoutlmv2-base-uncased')
+        >>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+        >>> model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")
 
         >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
         >>> question = "what's his name?"
diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py
index 3f6fb9d00a..0a07044afb 100755
--- a/src/transformers/models/led/modeling_led.py
+++ b/src/transformers/models/led/modeling_led.py
@@ -343,10 +343,24 @@ class LEDEncoderSelfAttention(nn.Module):
         Example:
 
         ```python
-        chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
-                                 -1.8348,  0.7672,  0.2986,  0.0285,
-                                 -0.7584,  0.4206, -0.0405,  0.1599,
-                                 2.0514, -1.1600,  0.5372,  0.2629 ]
+        chunked_hidden_states: [
+            0.4983,
+            2.6918,
+            -0.0071,
+            1.0492,
+            -1.8348,
+            0.7672,
+            0.2986,
+            0.0285,
+            -0.7584,
+            0.4206,
+            -0.0405,
+            0.1599,
+            2.0514,
+            -1.1600,
+            0.5372,
+            0.2629,
+        ]
         window_overlap = num_rows = 4
         ```
 
@@ -2334,11 +2348,12 @@ class LEDForConditionalGeneration(LEDPreTrainedModel):
 
         ```python
         >>> from transformers import LEDTokenizer, LEDForConditionalGeneration
-        >>> tokenizer = LEDTokenizer.from_pretrained('allenai/led-base-16384')
+
+        >>> tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
         >>> TXT = "My friends are <mask> but they eat too many carbs."
 
-        >>> model = LEDForConditionalGeneration.from_pretrained('allenai/led-base-16384')
-        >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
+        >>> model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")
+        >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
 
         >>> prediction = model.generate(input_ids)[0]
         >>> print(tokenizer.decode(prediction, skip_special_tokens=True))
diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py
index 491d5334b0..3e4736ea08 100644
--- a/src/transformers/models/led/modeling_tf_led.py
+++ b/src/transformers/models/led/modeling_tf_led.py
@@ -610,10 +610,24 @@ class TFLEDEncoderSelfAttention(tf.keras.layers.Layer):
         Example:
 
         ```python
-        chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
-                                 -1.8348,  0.7672,  0.2986,  0.0285,
-                                 -0.7584,  0.4206, -0.0405,  0.1599,
-                                 2.0514, -1.1600,  0.5372,  0.2629 ]
+        chunked_hidden_states: [
+            0.4983,
+            2.6918,
+            -0.0071,
+            1.0492,
+            -1.8348,
+            0.7672,
+            0.2986,
+            0.0285,
+            -0.7584,
+            0.4206,
+            -0.0405,
+            0.1599,
+            2.0514,
+            -1.1600,
+            0.5372,
+            0.2629,
+        ]
         window_overlap = num_rows = 4
         ```
 
@@ -2382,11 +2396,12 @@ class TFLEDForConditionalGeneration(TFLEDPreTrainedModel):
         ```python
         >>> from transformers import LEDTokenizer, TFLEDForConditionalGeneration
         >>> import tensorflow as tf
-        >>> mname = 'allenai/led-base-16384'
+
+        >>> mname = "allenai/led-base-16384"
         >>> tokenizer = LEDTokenizer.from_pretrained(mname)
         >>> TXT = "My friends are <mask> but they eat too many carbs."
         >>> model = TFLEDForConditionalGeneration.from_pretrained(mname)
-        >>> batch = tokenizer([TXT], return_tensors='tf')
+        >>> batch = tokenizer([TXT], return_tensors="tf")
         >>> logits = model(inputs=batch.input_ids).logits
         >>> probs = tf.nn.softmax(logits[0])
         >>> # probs[5] is associated with the mask token
diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py
index 987897224c..6f9f6b43f5 100755
--- a/src/transformers/models/longformer/modeling_longformer.py
+++ b/src/transformers/models/longformer/modeling_longformer.py
@@ -726,10 +726,24 @@ class LongformerSelfAttention(nn.Module):
         Example:
 
         ```python
-        chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
-                                 -1.8348,  0.7672,  0.2986,  0.0285,
-                                 -0.7584,  0.4206, -0.0405,  0.1599,
-                                 2.0514, -1.1600,  0.5372,  0.2629 ]
+        chunked_hidden_states: [
+            0.4983,
+            2.6918,
+            -0.0071,
+            1.0492,
+            -1.8348,
+            0.7672,
+            0.2986,
+            0.0285,
+            -0.7584,
+            0.4206,
+            -0.0405,
+            0.1599,
+            2.0514,
+            -1.1600,
+            0.5372,
+            0.2629,
+        ]
         window_overlap = num_rows = 4
         ```
 
@@ -1605,19 +1619,30 @@ class LongformerModel(LongformerPreTrainedModel):
         >>> import torch
         >>> from transformers import LongformerModel, LongformerTokenizer
 
-        >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
-        >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
+        >>> model = LongformerModel.from_pretrained("allenai/longformer-base-4096")
+        >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
 
-        >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
+        >>> SAMPLE_TEXT = " ".join(["Hello world! "] * 1000)  # long input document
         >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
 
-        >>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
-        >>> global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to global attention to be deactivated for all tokens
-        >>> global_attention_mask[:, [1, 4, 21,]] = 1  # Set global attention to random tokens for the sake of this example
-        ...                                     # Usually, set global attention based on the task. For example,
-        ...                                     # classification: the <s> token
-        ...                                     # QA: question tokens
-        ...                                     # LM: potentially on the beginning of sentences and paragraphs
+        >>> attention_mask = torch.ones(
+        ...     input_ids.shape, dtype=torch.long, device=input_ids.device
+        ... )  # initialize to local attention
+        >>> global_attention_mask = torch.zeros(
+        ...     input_ids.shape, dtype=torch.long, device=input_ids.device
+        ... )  # initialize to global attention to be deactivated for all tokens
+        >>> global_attention_mask[
+        ...     :,
+        ...     [
+        ...         1,
+        ...         4,
+        ...         21,
+        ...     ],
+        ... ] = 1  # Set global attention to random tokens for the sake of this example
+        >>> # Usually, set global attention based on the task. For example,
+        >>> # classification: the <s> token
+        >>> # QA: question tokens
+        >>> # LM: potentially on the beginning of sentences and paragraphs
         >>> outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
         >>> sequence_output = outputs.last_hidden_state
         >>> pooled_output = outputs.pooler_output
@@ -1748,14 +1773,14 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
         >>> import torch
         >>> from transformers import LongformerForMaskedLM, LongformerTokenizer
 
-        >>> model = LongformerForMaskedLM.from_pretrained('allenai/longformer-base-4096')
-        >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
+        >>> model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")
+        >>> tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
 
-        >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
+        >>> SAMPLE_TEXT = " ".join(["Hello world! "] * 1000)  # long input document
         >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
 
         >>> attention_mask = None  # default is local attention everywhere, which is a good choice for MaskedLM
-        ...                        # check `LongformerModel.forward` for more details how to set *attention_mask*
+        >>> # check `LongformerModel.forward` for more details how to set *attention_mask*
         >>> outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
         >>> loss = outputs.loss
         >>> prediction_logits = outputs.logits
@@ -1994,8 +2019,10 @@ class LongformerForQuestionAnswering(LongformerPreTrainedModel):
         >>> end_logits = outputs.end_logits
         >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())
 
-        >>> answer_tokens = all_tokens[torch.argmax(start_logits) :torch.argmax(end_logits)+1]
-        >>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token
+        >>> answer_tokens = all_tokens[torch.argmax(start_logits) : torch.argmax(end_logits) + 1]
+        >>> answer = tokenizer.decode(
+        ...     tokenizer.convert_tokens_to_ids(answer_tokens)
+        ... )  # remove space prepending space token
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py
index c47955201d..31bd55abd0 100644
--- a/src/transformers/models/longformer/modeling_tf_longformer.py
+++ b/src/transformers/models/longformer/modeling_tf_longformer.py
@@ -1138,10 +1138,24 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
         Example:
 
         ```python
-        chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
-                                 -1.8348,  0.7672,  0.2986,  0.0285,
-                                 -0.7584,  0.4206, -0.0405,  0.1599,
-                                 2.0514, -1.1600,  0.5372,  0.2629 ]
+        chunked_hidden_states: [
+            0.4983,
+            2.6918,
+            -0.0071,
+            1.0492,
+            -1.8348,
+            0.7672,
+            0.2986,
+            0.0285,
+            -0.7584,
+            0.4206,
+            -0.0405,
+            0.1599,
+            2.0514,
+            -1.1600,
+            0.5372,
+            0.2629,
+        ]
         window_overlap = num_rows = 4
         ```
 
diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py
index 6fe0133be1..1ce344df9c 100644
--- a/src/transformers/models/luke/modeling_luke.py
+++ b/src/transformers/models/luke/modeling_luke.py
@@ -937,8 +937,8 @@ class LukeModel(LukePreTrainedModel):
 
         >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
         >>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
-
         # Compute the contextualized entity representation corresponding to the entity mention "Beyoncé"
+
         >>> text = "Beyoncé lives in Los Angeles."
         >>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"
 
@@ -946,13 +946,21 @@ class LukeModel(LukePreTrainedModel):
         >>> outputs = model(**encoding)
         >>> word_last_hidden_state = outputs.last_hidden_state
         >>> entity_last_hidden_state = outputs.entity_last_hidden_state
-
         # Input Wikipedia entities to obtain enriched contextualized representations of word tokens
-        >>> text = "Beyoncé lives in Los Angeles."
-        >>> entities = ["Beyoncé", "Los Angeles"]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
-        >>> entity_spans = [(0, 7), (17, 28)]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
 
-        >>> encoding = tokenizer(text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
+        >>> text = "Beyoncé lives in Los Angeles."
+        >>> entities = [
+        ...     "Beyoncé",
+        ...     "Los Angeles",
+        ... ]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
+        >>> entity_spans = [
+        ...     (0, 7),
+        ...     (17, 28),
+        ... ]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
+
+        >>> encoding = tokenizer(
+        ...     text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt"
+        ... )
         >>> outputs = model(**encoding)
         >>> word_last_hidden_state = outputs.last_hidden_state
         >>> entity_last_hidden_state = outputs.entity_last_hidden_state
@@ -1423,7 +1431,10 @@ class LukeForEntityPairClassification(LukePreTrainedModel):
         >>> model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
 
         >>> text = "Beyoncé lives in Los Angeles."
-        >>> entity_spans = [(0, 7), (17, 28)]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
+        >>> entity_spans = [
+        ...     (0, 7),
+        ...     (17, 28),
+        ... ]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
         >>> inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
         >>> outputs = model(**inputs)
         >>> logits = outputs.logits
@@ -1548,8 +1559,8 @@ class LukeForEntitySpanClassification(LukePreTrainedModel):
         >>> model = LukeForEntitySpanClassification.from_pretrained("studio-ousia/luke-large-finetuned-conll-2003")
 
         >>> text = "Beyoncé lives in Los Angeles"
-
         # List all possible entity spans in the text
+
         >>> word_start_positions = [0, 8, 14, 17, 21]  # character-based start positions of word tokens
         >>> word_end_positions = [7, 13, 16, 20, 28]  # character-based end positions of word tokens
         >>> entity_spans = []
@@ -1563,7 +1574,7 @@ class LukeForEntitySpanClassification(LukePreTrainedModel):
         >>> predicted_class_indices = logits.argmax(-1).squeeze().tolist()
         >>> for span, predicted_class_idx in zip(entity_spans, predicted_class_indices):
         ...     if predicted_class_idx != 0:
-        ...        print(text[span[0]:span[1]], model.config.id2label[predicted_class_idx])
+        ...         print(text[span[0] : span[1]], model.config.id2label[predicted_class_idx])
         Beyoncé PER
         Los Angeles LOC
         ```"""
diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py
index c5900c0020..c775ee2540 100755
--- a/src/transformers/models/m2m_100/modeling_m2m_100.py
+++ b/src/transformers/models/m2m_100/modeling_m2m_100.py
@@ -1283,14 +1283,14 @@ class M2M100ForConditionalGeneration(M2M100PreTrainedModel):
         ```python
         >>> from transformers import M2M100Tokenizer, M2M100ForConditionalGeneration
 
-        >>> model = M2M100ForConditionalGeneration.from_pretrained('facebook/m2m100_418M')
-        >>> tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M')
+        >>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+        >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M")
 
         >>> text_to_translate = "Life is like a box of chocolates"
-        >>> model_inputs = tokenizer(text_to_translate, return_tensors='pt')
+        >>> model_inputs = tokenizer(text_to_translate, return_tensors="pt")
 
         >>> # translate to French
-        >>> gen_tokens = model.generate( **model_inputs, forced_bos_token_id=tokenizer.get_lang_id("fr"))
+        >>> gen_tokens = model.generate(**model_inputs, forced_bos_token_id=tokenizer.get_lang_id("fr"))
         >>> print(tokenizer.batch_decode(gen_tokens, skip_special_tokens=True))
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py
index 80ca021252..4e54dfc12b 100644
--- a/src/transformers/models/m2m_100/tokenization_m2m_100.py
+++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py
@@ -111,13 +111,14 @@ class M2M100Tokenizer(PreTrainedTokenizer):
 
     ```python
     >>> from transformers import M2M100Tokenizer
-    >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M, src_lang="en", tgt_lang="ro")
+
+    >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro")
     >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
-    >>> tgt_text =  "Şeful ONU declară că nu există o soluţie militară în Siria"
+    >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
     >>> model_inputs = tokenizer(src_text, return_tensors="pt")
     >>> with tokenizer.as_target_tokenizer():
-    ...    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
-    >>> # model(**model_inputs, labels=labels) should work
+    ...     labels = tokenizer(tgt_text, return_tensors="pt").input_ids
+    >>> model(**model_inputs, labels=labels)  # should work
     ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/marian/modeling_flax_marian.py b/src/transformers/models/marian/modeling_flax_marian.py
index 0b0c32e591..193d629dc6 100644
--- a/src/transformers/models/marian/modeling_flax_marian.py
+++ b/src/transformers/models/marian/modeling_flax_marian.py
@@ -986,11 +986,11 @@ class FlaxMarianPreTrainedModel(FlaxPreTrainedModel):
         ```python
         >>> from transformers import MarianTokenizer, FlaxMarianMTModel
 
-        >>> tokenizer = MarianTokenizer.from_pretrained('facebook/marian-large-cnn')
-        >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> tokenizer = MarianTokenizer.from_pretrained("facebook/marian-large-cnn")
+        >>> model = FlaxMarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-de")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
+        >>> inputs = tokenizer(text, max_length=64, return_tensors="jax")
         >>> encoder_outputs = model.encode(**inputs)
         ```"""
 
@@ -1053,11 +1053,11 @@ class FlaxMarianPreTrainedModel(FlaxPreTrainedModel):
         ```python
         >>> from transformers import MarianTokenizer, FlaxMarianMTModel
 
-        >>> tokenizer = MarianTokenizer.from_pretrained('facebook/marian-large-cnn')
-        >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> tokenizer = MarianTokenizer.from_pretrained("facebook/marian-large-cnn")
+        >>> model = FlaxMarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-de")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
+        >>> inputs = tokenizer(text, max_length=64, return_tensors="jax")
         >>> encoder_outputs = model.encode(**inputs)
 
         >>> decoder_start_token_id = model.config.decoder_start_token_id
@@ -1321,11 +1321,11 @@ class FlaxMarianMTModel(FlaxMarianPreTrainedModel):
         ```python
         >>> from transformers import MarianTokenizer, FlaxMarianMTModel
 
-        >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
-        >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> model = FlaxMarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-de")
+        >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
+        >>> inputs = tokenizer(text, max_length=64, return_tensors="jax")
         >>> encoder_outputs = model.encode(**inputs)
 
         >>> decoder_start_token_id = model.config.decoder_start_token_id
@@ -1482,11 +1482,11 @@ FLAX_MARIAN_MT_DOCSTRING = """
     ```python
     >>> from transformers import MarianTokenizer, FlaxMarianMTModel
 
-    >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
-    >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+    >>> model = FlaxMarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-en-de")
+    >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
 
     >>> text = "My friends are cool but they eat too many carbs."
-    >>> input_ids = tokenizer(text, max_length=64, return_tensors='jax').input_ids
+    >>> input_ids = tokenizer(text, max_length=64, return_tensors="jax").input_ids
 
     >>> sequences = model.generate(input_ids, max_length=64, num_beams=2).sequences
 
diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py
index c6be85348a..a6855a3493 100755
--- a/src/transformers/models/marian/modeling_marian.py
+++ b/src/transformers/models/marian/modeling_marian.py
@@ -531,10 +531,11 @@ MARIAN_GENERATION_EXAMPLE = r"""
         ```python
         >>> from transformers import MarianTokenizer, MarianMTModel
         >>> from typing import List
-        >>> src = 'fr'  # source language
-        >>> trg = 'en'  # target language
+
+        >>> src = "fr"  # source language
+        >>> trg = "en"  # target language
         >>> sample_text = "où est l'arrêt de bus ?"
-        >>> model_name = f'Helsinki-NLP/opus-mt-{src}-{trg}'
+        >>> model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
 
         >>> model = MarianMTModel.from_pretrained(model_name)
         >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
@@ -1132,12 +1133,17 @@ class MarianModel(MarianPreTrainedModel):
         ```python
         >>> from transformers import MarianTokenizer, MarianModel
 
-        >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
-        >>> model = MarianModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
+        >>> model = MarianModel.from_pretrained("Helsinki-NLP/opus-mt-en-de")
 
-        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
-        >>> decoder_input_ids = tokenizer("<pad> Studien haben gezeigt dass es hilfreich ist einen Hund zu besitzen",
-        ... return_tensors="pt", add_special_tokens=False).input_ids  # Batch size 1
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
+        >>> decoder_input_ids = tokenizer(
+        ...     "<pad> Studien haben gezeigt dass es hilfreich ist einen Hund zu besitzen",
+        ...     return_tensors="pt",
+        ...     add_special_tokens=False,
+        ... ).input_ids  # Batch size 1
         >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
 
         >>> last_hidden_states = outputs.last_hidden_state
@@ -1513,8 +1519,8 @@ class MarianForCausalLM(MarianPreTrainedModel):
         ```python
         >>> from transformers import MarianTokenizer, MarianForCausalLM
 
-        >>> tokenizer = MarianTokenizer.from_pretrained('facebook/bart-large')
-        >>> model = MarianForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
+        >>> tokenizer = MarianTokenizer.from_pretrained("facebook/bart-large")
+        >>> model = MarianForCausalLM.from_pretrained("facebook/bart-large", add_cross_attention=False)
         >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py
index 52c8ac7275..228e54cff9 100644
--- a/src/transformers/models/marian/modeling_tf_marian.py
+++ b/src/transformers/models/marian/modeling_tf_marian.py
@@ -562,10 +562,11 @@ MARIAN_GENERATION_EXAMPLE = r"""
         ```python
         >>> from transformers import MarianTokenizer, TFMarianMTModel
         >>> from typing import List
-        >>> src = 'fr'  # source language
-        >>> trg = 'en'  # target language
+
+        >>> src = "fr"  # source language
+        >>> trg = "en"  # target language
         >>> sample_text = "où est l'arrêt de bus ?"
-        >>> model_name = f'Helsinki-NLP/opus-mt-{src}-{trg}'
+        >>> model_name = f"Helsinki-NLP/opus-mt-{src}-{trg}"
 
         >>> model = TFMarianMTModel.from_pretrained(model_name)
         >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py
index 12c06baba8..487f96ad79 100644
--- a/src/transformers/models/marian/tokenization_marian.py
+++ b/src/transformers/models/marian/tokenization_marian.py
@@ -102,15 +102,17 @@ class MarianTokenizer(PreTrainedTokenizer):
 
     ```python
     >>> from transformers import MarianTokenizer
-    >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
-    >>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
+
+    >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
+    >>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
     >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
     >>> inputs = tokenizer(src_texts, return_tensors="pt", padding=True)
     >>> with tokenizer.as_target_tokenizer():
     ...     labels = tokenizer(tgt_texts, return_tensors="pt", padding=True)
     >>> inputs["labels"] = labels["input_ids"]
     # keys  [input_ids, attention_mask, labels].
-    >>> outputs = model(**inputs) should work
+
+    >>> outputs = model(**inputs)  # should work
     ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/mbart/modeling_flax_mbart.py b/src/transformers/models/mbart/modeling_flax_mbart.py
index f569d08b05..e909f7700a 100644
--- a/src/transformers/models/mbart/modeling_flax_mbart.py
+++ b/src/transformers/models/mbart/modeling_flax_mbart.py
@@ -1046,11 +1046,11 @@ class FlaxMBartPreTrainedModel(FlaxPreTrainedModel):
         ```python
         >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
 
-        >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
-        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
+        >>> model = FlaxMBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
+        >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax")
         >>> encoder_outputs = model.encode(**inputs)
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1112,11 +1112,11 @@ class FlaxMBartPreTrainedModel(FlaxPreTrainedModel):
         ```python
         >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
 
-        >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
-        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
+        >>> model = FlaxMBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
+        >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax")
         >>> encoder_outputs = model.encode(**inputs)
 
         >>> decoder_start_token_id = model.config.decoder_start_token_id
@@ -1379,11 +1379,11 @@ class FlaxMBartForConditionalGeneration(FlaxMBartPreTrainedModel):
         ```python
         >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
 
-        >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
-        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
+        >>> model = FlaxMBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
+        >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-cc25")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax")
         >>> encoder_outputs = model.encode(**inputs)
 
         >>> decoder_start_token_id = model.config.decoder_start_token_id
diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py
index 73bc4f0ef8..06f7e514e2 100755
--- a/src/transformers/models/mbart/modeling_mbart.py
+++ b/src/transformers/models/mbart/modeling_mbart.py
@@ -1776,8 +1776,8 @@ class MBartForCausalLM(MBartPreTrainedModel):
         ```python
         >>> from transformers import MBartTokenizer, MBartForCausalLM
 
-        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/bart-large')
-        >>> model = MBartForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
+        >>> tokenizer = MBartTokenizer.from_pretrained("facebook/bart-large")
+        >>> model = MBartForCausalLM.from_pretrained("facebook/bart-large", add_cross_attention=False)
         >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py
index 23443dc238..01d00b9e6b 100644
--- a/src/transformers/models/mbart/tokenization_mbart.py
+++ b/src/transformers/models/mbart/tokenization_mbart.py
@@ -81,10 +81,11 @@ class MBartTokenizer(XLMRobertaTokenizer):
 
     ```python
     >>> from transformers import MBartTokenizer
-    >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-en-ro', src_lang="en_XX", tgt_lang="ro_RO")
+
+    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
     >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
     >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt)
+    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
     >>> with tokenizer.as_target_tokenizer():
     ...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")
     >>> inputs["labels"] = labels["input_ids"]
diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py
index b6850187bc..311cb8058c 100644
--- a/src/transformers/models/mbart/tokenization_mbart_fast.py
+++ b/src/transformers/models/mbart/tokenization_mbart_fast.py
@@ -96,10 +96,13 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
 
     ```python
     >>> from transformers import MBartTokenizerFast
-    >>> tokenizer = MBartTokenizerFast.from_pretrained('facebook/mbart-large-en-ro', src_lang="en_XX", tgt_lang="ro_RO")
+
+    >>> tokenizer = MBartTokenizerFast.from_pretrained(
+    ...     "facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO"
+    ... )
     >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
     >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt)
+    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
     >>> with tokenizer.as_target_tokenizer():
     ...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")
     >>> inputs["labels"] = labels["input_ids"]
diff --git a/src/transformers/models/mbart50/tokenization_mbart50.py b/src/transformers/models/mbart50/tokenization_mbart50.py
index 389535fc30..1282e4774a 100644
--- a/src/transformers/models/mbart50/tokenization_mbart50.py
+++ b/src/transformers/models/mbart50/tokenization_mbart50.py
@@ -96,12 +96,13 @@ class MBart50Tokenizer(PreTrainedTokenizer):
 
     ```python
     >>> from transformers import MBart50Tokenizer
+
     >>> tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
     >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
-    >>> tgt_text =  "Şeful ONU declară că nu există o soluţie militară în Siria"
+    >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
     >>> model_inputs = tokenizer(src_text, return_tensors="pt")
     >>> with tokenizer.as_target_tokenizer():
-    ...    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
+    ...     labels = tokenizer(tgt_text, return_tensors="pt").input_ids
     >>> # model(**model_inputs, labels=labels) should work
     ```"""
 
diff --git a/src/transformers/models/mbart50/tokenization_mbart50_fast.py b/src/transformers/models/mbart50/tokenization_mbart50_fast.py
index 40448bdcbc..928118bfd5 100644
--- a/src/transformers/models/mbart50/tokenization_mbart50_fast.py
+++ b/src/transformers/models/mbart50/tokenization_mbart50_fast.py
@@ -91,12 +91,13 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
 
     ```python
     >>> from transformers import MBart50TokenizerFast
+
     >>> tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
     >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
-    >>> tgt_text =  "Şeful ONU declară că nu există o soluţie militară în Siria"
+    >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
     >>> model_inputs = tokenizer(src_text, return_tensors="pt")
     >>> with tokenizer.as_target_tokenizer():
-    ...    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
+    ...     labels = tokenizer(tgt_text, return_tensors="pt").input_ids
     >>> # model(**model_inputs, labels=labels) should work
     ```"""
 
diff --git a/src/transformers/models/megatron_bert/modeling_megatron_bert.py b/src/transformers/models/megatron_bert/modeling_megatron_bert.py
index 8f71a84a3f..e0f9f1191f 100755
--- a/src/transformers/models/megatron_bert/modeling_megatron_bert.py
+++ b/src/transformers/models/megatron_bert/modeling_megatron_bert.py
@@ -1063,8 +1063,8 @@ class MegatronBertForPreTraining(MegatronBertPreTrainedModel):
         >>> from transformers import BertTokenizer, MegatronBertForPreTraining
         >>> import torch
 
-        >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m')
-        >>> model = MegatronBertForPreTraining.from_pretrained('nvidia/megatron-bert-cased-345m')
+        >>> tokenizer = BertTokenizer.from_pretrained("nvidia/megatron-bert-cased-345m")
+        >>> model = MegatronBertForPreTraining.from_pretrained("nvidia/megatron-bert-cased-345m")
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -1187,8 +1187,8 @@ class MegatronBertForCausalLM(MegatronBertPreTrainedModel):
         >>> from transformers import BertTokenizer, MegatronBertForCausalLM, MegatronBertConfig
         >>> import torch
 
-        >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m')
-        >>> model = MegatronBertForCausalLM.from_pretrained('nvidia/megatron-bert-cased-345m', is_decoder=True)
+        >>> tokenizer = BertTokenizer.from_pretrained("nvidia/megatron-bert-cased-345m")
+        >>> model = MegatronBertForCausalLM.from_pretrained("nvidia/megatron-bert-cased-345m", is_decoder=True)
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -1413,16 +1413,16 @@ class MegatronBertForNextSentencePrediction(MegatronBertPreTrainedModel):
         >>> from transformers import BertTokenizer, MegatronBertForNextSentencePrediction
         >>> import torch
 
-        >>> tokenizer = BertTokenizer.from_pretrained('nvidia/megatron-bert-cased-345m')
-        >>> model = MegatronBertForNextSentencePrediction.from_pretrained('nvidia/megatron-bert-cased-345m')
+        >>> tokenizer = BertTokenizer.from_pretrained("nvidia/megatron-bert-cased-345m")
+        >>> model = MegatronBertForNextSentencePrediction.from_pretrained("nvidia/megatron-bert-cased-345m")
 
         >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
         >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
 
         >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
         >>> logits = outputs.logits
-        >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
         ```"""
 
         if "next_sentence_label" in kwargs:
diff --git a/src/transformers/models/mmbt/modeling_mmbt.py b/src/transformers/models/mmbt/modeling_mmbt.py
index 7b8b3c5ce6..3b3942f2ec 100644
--- a/src/transformers/models/mmbt/modeling_mmbt.py
+++ b/src/transformers/models/mmbt/modeling_mmbt.py
@@ -214,7 +214,7 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
 
         ```python
         # For example purposes. Not runnable.
-        transformer = BertModel.from_pretrained('bert-base-uncased')
+        transformer = BertModel.from_pretrained("bert-base-uncased")
         encoder = ImageEncoder(args)
         mmbt = MMBTModel(config, transformer, encoder)
         ```"""
@@ -334,7 +334,7 @@ class MMBTForClassification(nn.Module):
 
     ```python
     # For example purposes. Not runnable.
-    transformer = BertModel.from_pretrained('bert-base-uncased')
+    transformer = BertModel.from_pretrained("bert-base-uncased")
     encoder = ImageEncoder(args)
     model = MMBTForClassification(config, transformer, encoder)
     outputs = model(input_modal, input_ids, labels=labels)
diff --git a/src/transformers/models/mobilebert/modeling_mobilebert.py b/src/transformers/models/mobilebert/modeling_mobilebert.py
index 57bc42edbe..a37bd82983 100644
--- a/src/transformers/models/mobilebert/modeling_mobilebert.py
+++ b/src/transformers/models/mobilebert/modeling_mobilebert.py
@@ -962,7 +962,9 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel):
         >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
         >>> model = MobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
 
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
+        ...     0
+        ... )  # Batch size 1
         >>> outputs = model(input_ids)
 
         >>> prediction_logits = outputs.prediction_logits
@@ -1147,12 +1149,12 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel):
         >>> from transformers import MobileBertTokenizer, MobileBertForNextSentencePrediction
         >>> import torch
 
-        >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
-        >>> model = MobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
+        >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
+        >>> model = MobileBertForNextSentencePrediction.from_pretrained("google/mobilebert-uncased")
 
         >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
         >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
 
         >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
         >>> loss = outputs.loss
diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
index e4a8b73029..d060b2b0bc 100644
--- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
+++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
@@ -1038,8 +1038,8 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
         >>> import tensorflow as tf
         >>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
 
-        >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
-        >>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
+        >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
+        >>> model = TFMobileBertForPreTraining.from_pretrained("google/mobilebert-uncased")
         >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
         >>> outputs = model(input_ids)
         >>> prediction_scores, seq_relationship_scores = outputs[:2]
@@ -1250,14 +1250,14 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel, TFNextS
         >>> import tensorflow as tf
         >>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
 
-        >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
-        >>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
+        >>> tokenizer = MobileBertTokenizer.from_pretrained("google/mobilebert-uncased")
+        >>> model = TFMobileBertForNextSentencePrediction.from_pretrained("google/mobilebert-uncased")
 
         >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
         >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="tf")
 
-        >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
+        >>> logits = model(encoding["input_ids"], token_type_ids=encoding["token_type_ids"])[0]
         ```"""
         inputs = input_processing(
             func=self.call,
diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py
index 128eba98e0..314198c69a 100644
--- a/src/transformers/models/mt5/modeling_mt5.py
+++ b/src/transformers/models/mt5/modeling_mt5.py
@@ -34,6 +34,7 @@ class MT5Model(T5Model):
 
     ```python
     >>> from transformers import MT5Model, T5Tokenizer
+
     >>> model = MT5Model.from_pretrained("google/mt5-small")
     >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
     >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
@@ -67,6 +68,7 @@ class MT5ForConditionalGeneration(T5ForConditionalGeneration):
 
     ```python
     >>> from transformers import MT5ForConditionalGeneration, T5Tokenizer
+
     >>> model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
     >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
     >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
@@ -75,7 +77,7 @@ class MT5ForConditionalGeneration(T5ForConditionalGeneration):
     >>> with tokenizer.as_target_tokenizer():
     ...     labels = tokenizer(summary, return_tensors="pt")
 
-    >>> outputs = model(**inputs,labels=labels["input_ids"])
+    >>> outputs = model(**inputs, labels=labels["input_ids"])
     >>> loss = outputs.loss
     ```"""
 
@@ -98,6 +100,7 @@ class MT5EncoderModel(T5EncoderModel):
 
     ```python
     >>> from transformers import MT5EncoderModel, T5Tokenizer
+
     >>> model = MT5EncoderModel.from_pretrained("google/mt5-small")
     >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
     >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
diff --git a/src/transformers/models/mt5/modeling_tf_mt5.py b/src/transformers/models/mt5/modeling_tf_mt5.py
index 3b08bb4a02..2808b8421a 100644
--- a/src/transformers/models/mt5/modeling_tf_mt5.py
+++ b/src/transformers/models/mt5/modeling_tf_mt5.py
@@ -34,6 +34,7 @@ class TFMT5Model(TFT5Model):
 
     ```python
     >>> from transformers import TFMT5Model, T5Tokenizer
+
     >>> model = TFMT5Model.from_pretrained("google/mt5-small")
     >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
     >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
@@ -58,6 +59,7 @@ class TFMT5ForConditionalGeneration(TFT5ForConditionalGeneration):
 
     ```python
     >>> from transformers import TFMT5ForConditionalGeneration, T5Tokenizer
+
     >>> model = TFMT5ForConditionalGeneration.from_pretrained("google/mt5-small")
     >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
     >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
@@ -66,7 +68,7 @@ class TFMT5ForConditionalGeneration(TFT5ForConditionalGeneration):
     >>> with tokenizer.as_target_tokenizer():
     ...     labels = tokenizer(summary, return_tensors="tf")
 
-    >>> outputs = model(**inputs,labels=labels["input_ids"])
+    >>> outputs = model(**inputs, labels=labels["input_ids"])
     >>> loss = outputs.loss
     ```"""
 
@@ -83,6 +85,7 @@ class TFMT5EncoderModel(TFT5EncoderModel):
 
     ```python
     >>> from transformers import TFMT5EncoderModel, T5Tokenizer
+
     >>> model = TFMT5EncoderModel.from_pretrained("google/mt5-small")
     >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
     >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
diff --git a/src/transformers/models/openai/modeling_openai.py b/src/transformers/models/openai/modeling_openai.py
index 8bce94ffce..73c9098dda 100644
--- a/src/transformers/models/openai/modeling_openai.py
+++ b/src/transformers/models/openai/modeling_openai.py
@@ -675,14 +675,16 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         >>> from transformers import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel
         >>> import torch
 
-        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        >>> model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
-        >>> tokenizer.add_special_tokens({'cls_token': '[CLS]'})  # Add a [CLS] to the vocabulary (we should train it also!)
+        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
+        >>> model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")
+        >>> tokenizer.add_special_tokens(
+        ...     {"cls_token": "[CLS]"}
+        ... )  # Add a [CLS] to the vocabulary (we should train it also!)
         >>> model.resize_token_embeddings(len(tokenizer))
 
         >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
         >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        >>> mc_token_ids = torch.tensor([input_ids.size(-1)-1, input_ids.size(-1)-1]).unsqueeze(0)  # Batch size 1
+        >>> mc_token_ids = torch.tensor([input_ids.size(-1) - 1, input_ids.size(-1) - 1]).unsqueeze(0)  # Batch size 1
 
         >>> outputs = model(input_ids, mc_token_ids=mc_token_ids)
         >>> lm_logits = outputs.lm_logits
diff --git a/src/transformers/models/openai/modeling_tf_openai.py b/src/transformers/models/openai/modeling_tf_openai.py
index 88511f0c6b..b135aa07cb 100644
--- a/src/transformers/models/openai/modeling_tf_openai.py
+++ b/src/transformers/models/openai/modeling_tf_openai.py
@@ -726,18 +726,22 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
         >>> import tensorflow as tf
         >>> from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel
 
-        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        >>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
+        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
+        >>> model = TFOpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")
 
         >>> # Add a [CLS] to the vocabulary (we should train it also!)
-        >>> tokenizer.add_special_tokens({'cls_token': '[CLS]'})
+        >>> tokenizer.add_special_tokens({"cls_token": "[CLS]"})
         >>> model.resize_token_embeddings(len(tokenizer))  # Update the model embeddings with the new vocabulary size
         >>> print(tokenizer.cls_token_id, len(tokenizer))  # The newly token the last token of the vocabulary
 
         >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
         >>> encoding = tokenizer(choices, return_tensors="tf")
         >>> inputs = {k: tf.expand_dims(v, 0) for k, v in encoding.items()}
-        >>> inputs["mc_token_ids"]= tf.constant([inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1])[None, :]  # Batch size 1
+        >>> inputs["mc_token_ids"] = tf.constant(
+        ...     [inputs["input_ids"].shape[-1] - 1, inputs["input_ids"].shape[-1] - 1]
+        ... )[
+        ...     None, :
+        ... ]  # Batch size 1
         >>> outputs = model(inputs)
         >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
         ```"""
diff --git a/src/transformers/models/pegasus/modeling_flax_pegasus.py b/src/transformers/models/pegasus/modeling_flax_pegasus.py
index e11f7e2fa7..7599909f6a 100644
--- a/src/transformers/models/pegasus/modeling_flax_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_flax_pegasus.py
@@ -994,11 +994,11 @@ class FlaxPegasusPreTrainedModel(FlaxPreTrainedModel):
         ```python
         >>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
 
-        >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
-        >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
+        >>> model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-large")
+        >>> tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="np")
         >>> encoder_outputs = model.encode(**inputs)
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1060,11 +1060,11 @@ class FlaxPegasusPreTrainedModel(FlaxPreTrainedModel):
         ```python
         >>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
 
-        >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
-        >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
+        >>> model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-large")
+        >>> tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="np")
         >>> encoder_outputs = model.encode(**inputs)
 
         >>> decoder_start_token_id = model.config.decoder_start_token_id
@@ -1329,11 +1329,11 @@ class FlaxPegasusForConditionalGeneration(FlaxPegasusPreTrainedModel):
         ```python
         >>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
 
-        >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
-        >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
+        >>> model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-large")
+        >>> tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="np")
         >>> encoder_outputs = model.encode(**inputs)
 
         >>> decoder_start_token_id = model.config.decoder_start_token_id
diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py
index bcce420329..14b300731d 100755
--- a/src/transformers/models/pegasus/modeling_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_pegasus.py
@@ -1206,7 +1206,9 @@ class PegasusModel(PegasusPreTrainedModel):
         >>> tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
         >>> model = PegasusModel.from_pretrained("google/pegasus-large")
 
-        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
         >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
         >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
 
@@ -1620,8 +1622,8 @@ class PegasusForCausalLM(PegasusPreTrainedModel):
         ```python
         >>> from transformers import PegasusTokenizer, PegasusForCausalLM
 
-        >>> tokenizer = PegasusTokenizer.from_pretrained('facebook/bart-large')
-        >>> model = PegasusForCausalLM.from_pretrained('facebook/bart-large', add_cross_attention=False)
+        >>> tokenizer = PegasusTokenizer.from_pretrained("facebook/bart-large")
+        >>> model = PegasusForCausalLM.from_pretrained("facebook/bart-large", add_cross_attention=False)
         >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py
index ce14e16711..f372d09a3e 100755
--- a/src/transformers/models/perceiver/modeling_perceiver.py
+++ b/src/transformers/models/perceiver/modeling_perceiver.py
@@ -773,7 +773,11 @@ class PerceiverModel(PerceiverPreTrainedModel):
 
         ```python
         >>> from transformers import PerceiverConfig, PerceiverTokenizer, PerceiverFeatureExtractor, PerceiverModel
-        >>> from transformers.models.perceiver.modeling_perceiver import PerceiverTextPreprocessor, PerceiverImagePreprocessor, PerceiverClassificationDecoder
+        >>> from transformers.models.perceiver.modeling_perceiver import (
+        ...     PerceiverTextPreprocessor,
+        ...     PerceiverImagePreprocessor,
+        ...     PerceiverClassificationDecoder,
+        ... )
         >>> import torch
         >>> import requests
         >>> from PIL import Image
@@ -785,10 +789,12 @@ class PerceiverModel(PerceiverPreTrainedModel):
         >>> # using trainable position embeddings
         >>> config = PerceiverConfig()
         >>> preprocessor = PerceiverTextPreprocessor(config)
-        >>> decoder = PerceiverClassificationDecoder(config,
-        ...                                          num_channels=config.d_latents,
-        ...                                          trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
-        ...                                          use_query_residual=True)
+        >>> decoder = PerceiverClassificationDecoder(
+        ...     config,
+        ...     num_channels=config.d_latents,
+        ...     trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
+        ...     use_query_residual=True,
+        ... )
         >>> model = PerceiverModel(config, input_preprocessor=preprocessor, decoder=decoder)
 
         >>> # you can then do a forward pass as follows:
@@ -797,7 +803,7 @@ class PerceiverModel(PerceiverPreTrainedModel):
         >>> inputs = tokenizer(text, return_tensors="pt").input_ids
 
         >>> with torch.no_grad():
-        >>>    outputs = model(inputs=inputs)
+        ...     outputs = model(inputs=inputs)
         >>> logits = outputs.logits
 
         >>> # to train, one can train the model using standard cross-entropy:
@@ -808,37 +814,39 @@ class PerceiverModel(PerceiverPreTrainedModel):
 
         >>> # EXAMPLE 2: using the Perceiver to classify images
         >>> # - we define an ImagePreprocessor, which can be used to embed images
-        >>> preprocessor=PerceiverImagePreprocessor(
-        ...              config,
-        ...              prep_type="conv1x1",
-        ...              spatial_downsample=1,
-        ...              out_channels=256,
-        ...              position_encoding_type="trainable",
-        ...              concat_or_add_pos="concat",
-        ...              project_pos_dim=256,
-        ...              trainable_position_encoding_kwargs=dict(num_channels=256, index_dims=config.image_size ** 2,
-        ...              ),
+        >>> preprocessor = PerceiverImagePreprocessor(
+        ...     config,
+        ...     prep_type="conv1x1",
+        ...     spatial_downsample=1,
+        ...     out_channels=256,
+        ...     position_encoding_type="trainable",
+        ...     concat_or_add_pos="concat",
+        ...     project_pos_dim=256,
+        ...     trainable_position_encoding_kwargs=dict(
+        ...         num_channels=256,
+        ...         index_dims=config.image_size ** 2,
+        ...     ),
         ... )
 
         >>> model = PerceiverModel(
+        ...     config,
+        ...     input_preprocessor=preprocessor,
+        ...     decoder=PerceiverClassificationDecoder(
         ...         config,
-        ...         input_preprocessor=preprocessor,
-        ...         decoder=PerceiverClassificationDecoder(
-        ...              config,
-        ...              num_channels=config.d_latents,
-        ...              trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
-        ...              use_query_residual=True,
-        ...          ),
+        ...         num_channels=config.d_latents,
+        ...         trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
+        ...         use_query_residual=True,
+        ...     ),
         ... )
 
         >>> # you can then do a forward pass as follows:
         >>> feature_extractor = PerceiverFeatureExtractor()
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
         >>> inputs = feature_extractor(image, return_tensors="pt").pixel_values
 
         >>> with torch.no_grad():
-        >>>    outputs = model(inputs=inputs)
+        ...     outputs = model(inputs=inputs)
         >>> logits = outputs.logits
 
         >>> # to train, one can train the model using standard cross-entropy:
@@ -1001,14 +1009,14 @@ class PerceiverForMaskedLM(PerceiverPreTrainedModel):
         >>> from transformers import PerceiverTokenizer, PerceiverForMaskedLM
         >>> import torch
 
-        >>> tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')
-        >>> model = PerceiverForMaskedLM.from_pretrained('deepmind/language-perceiver')
+        >>> tokenizer = PerceiverTokenizer.from_pretrained("deepmind/language-perceiver")
+        >>> model = PerceiverForMaskedLM.from_pretrained("deepmind/language-perceiver")
 
         >>> # training
         >>> text = "This is an incomplete sentence where some words are missing."
         >>> inputs = tokenizer(text, padding="max_length", return_tensors="pt")
         >>> # mask " missing."
-        >>> inputs['input_ids'][0, 52:61] = tokenizer.mask_token_id
+        >>> inputs["input_ids"][0, 52:61] = tokenizer.mask_token_id
         >>> labels = tokenizer(text, padding="max_length", return_tensors="pt").input_ids
 
         >>> outputs = model(**inputs, labels=labels)
@@ -1020,11 +1028,11 @@ class PerceiverForMaskedLM(PerceiverPreTrainedModel):
         >>> encoding = tokenizer(text, padding="max_length", return_tensors="pt")
 
         >>> # mask bytes corresponding to " missing.". Note that the model performs much better if the masked span starts with a space.
-        >>> encoding['input_ids'][0, 52:61] = tokenizer.mask_token_id
+        >>> encoding["input_ids"][0, 52:61] = tokenizer.mask_token_id
 
         >>> # forward pass
         >>> with torch.no_grad():
-        >>>    outputs = model(**encoding)
+        ...     outputs = model(**encoding)
         >>> logits = outputs.logits
 
         >>> masked_tokens_predictions = logits[0, 52:61].argmax(dim=-1).tolist()
@@ -1117,8 +1125,8 @@ class PerceiverForSequenceClassification(PerceiverPreTrainedModel):
         ```python
         >>> from transformers import PerceiverTokenizer, PerceiverForSequenceClassification
 
-        >>> tokenizer = PerceiverTokenizer.from_pretrained('deepmind/language-perceiver')
-        >>> model = PerceiverForSequenceClassification.from_pretrained('deepmind/language-perceiver')
+        >>> tokenizer = PerceiverTokenizer.from_pretrained("deepmind/language-perceiver")
+        >>> model = PerceiverForSequenceClassification.from_pretrained("deepmind/language-perceiver")
 
         >>> text = "hello world"
         >>> inputs = tokenizer(text, return_tensors="pt").input_ids
@@ -1252,11 +1260,11 @@ class PerceiverForImageClassificationLearned(PerceiverPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained('deepmind/vision-perceiver-learned')
-        >>> model = PerceiverForImageClassificationLearned.from_pretrained('deepmind/vision-perceiver-learned')
+        >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained("deepmind/vision-perceiver-learned")
+        >>> model = PerceiverForImageClassificationLearned.from_pretrained("deepmind/vision-perceiver-learned")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt").pixel_values
         >>> outputs = model(inputs=inputs)
@@ -1389,11 +1397,11 @@ class PerceiverForImageClassificationFourier(PerceiverPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained('deepmind/vision-perceiver-fourier')
-        >>> model = PerceiverForImageClassificationFourier.from_pretrained('deepmind/vision-perceiver-fourier')
+        >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained("deepmind/vision-perceiver-fourier")
+        >>> model = PerceiverForImageClassificationFourier.from_pretrained("deepmind/vision-perceiver-fourier")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt").pixel_values
         >>> outputs = model(inputs=inputs)
@@ -1526,11 +1534,11 @@ class PerceiverForImageClassificationConvProcessing(PerceiverPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained('deepmind/vision-perceiver-conv')
-        >>> model = PerceiverForImageClassificationConvProcessing.from_pretrained('deepmind/vision-perceiver-conv')
+        >>> feature_extractor = PerceiverFeatureExtractor.from_pretrained("deepmind/vision-perceiver-conv")
+        >>> model = PerceiverForImageClassificationConvProcessing.from_pretrained("deepmind/vision-perceiver-conv")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt").pixel_values
         >>> outputs = model(inputs=inputs)
@@ -1676,7 +1684,7 @@ class PerceiverForOpticalFlow(PerceiverPreTrainedModel):
         >>> from transformers import PerceiverForOpticalFlow
         >>> import torch
 
-        >>> model = PerceiverForOpticalFlow.from_pretrained('deepmind/optical-flow-perceiver')
+        >>> model = PerceiverForOpticalFlow.from_pretrained("deepmind/optical-flow-perceiver")
 
         >>> # in the Perceiver IO paper, the authors extract a 3 x 3 patch around each pixel,
         >>> # leading to 3 x 3 x 3 = 27 values for each pixel (as each pixel also has 3 color channels)
@@ -1894,7 +1902,7 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
         >>> audio = torch.randn((1, 30720, 1))
         >>> inputs = dict(image=images, audio=audio, label=torch.zeros((images.shape[0], 700)))
 
-        >>> model = PerceiverForMultimodalAutoencoding.from_pretrained('deepmind/multimodal-perceiver')
+        >>> model = PerceiverForMultimodalAutoencoding.from_pretrained("deepmind/multimodal-perceiver")
 
         >>> # in the Perceiver IO paper, videos are auto-encoded in chunks
         >>> # each chunk subsamples different index dimensions of the image and audio modality decoder queries
@@ -1904,9 +1912,9 @@ class PerceiverForMultimodalAutoencoding(PerceiverPreTrainedModel):
         >>> # process the first chunk
         >>> chunk_idx = 0
         >>> subsampling = {
-        ... "image": torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1)),
-        ... "audio": torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1)),
-        ... "label": None,
+        ...     "image": torch.arange(image_chunk_size * chunk_idx, image_chunk_size * (chunk_idx + 1)),
+        ...     "audio": torch.arange(audio_chunk_size * chunk_idx, audio_chunk_size * (chunk_idx + 1)),
+        ...     "label": None,
         ... }
 
         >>> outputs = model(inputs=inputs, subsampled_output_points=subsampling)
diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py
index 5cf97de072..f9ee530c61 100644
--- a/src/transformers/models/prophetnet/modeling_prophetnet.py
+++ b/src/transformers/models/prophetnet/modeling_prophetnet.py
@@ -1292,8 +1292,8 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
         >>> from transformers import ProphetNetTokenizer, ProphetNetEncoder
         >>> import torch
 
-        >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
-        >>> model = ProphetNetEncoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone')
+        >>> tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
+        >>> model = ProphetNetEncoder.from_pretrained("patrickvonplaten/prophetnet-large-uncased-standalone")
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
 
@@ -1469,8 +1469,8 @@ class ProphetNetDecoder(ProphetNetPreTrainedModel):
         >>> from transformers import ProphetNetTokenizer, ProphetNetDecoder
         >>> import torch
 
-        >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
-        >>> model = ProphetNetDecoder.from_pretrained('microsoft/prophetnet-large-uncased', add_cross_attention=False)
+        >>> tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
+        >>> model = ProphetNetDecoder.from_pretrained("microsoft/prophetnet-large-uncased", add_cross_attention=False)
         >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -1809,10 +1809,12 @@ class ProphetNetModel(ProphetNetPreTrainedModel):
         ```python
         >>> from transformers import ProphetNetTokenizer, ProphetNetModel
 
-        >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
-        >>> model = ProphetNetModel.from_pretrained('microsoft/prophetnet-large-uncased')
+        >>> tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
+        >>> model = ProphetNetModel.from_pretrained("microsoft/prophetnet-large-uncased")
 
-        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
         >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
         >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
 
@@ -1929,10 +1931,12 @@ class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel):
         ```python
         >>> from transformers import ProphetNetTokenizer, ProphetNetForConditionalGeneration
 
-        >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
-        >>> model = ProphetNetForConditionalGeneration.from_pretrained('microsoft/prophetnet-large-uncased')
+        >>> tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
+        >>> model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")
 
-        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
         >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
         >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
 
@@ -2173,8 +2177,8 @@ class ProphetNetForCausalLM(ProphetNetPreTrainedModel):
         >>> from transformers import ProphetNetTokenizer, ProphetNetForCausalLM
         >>> import torch
 
-        >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
-        >>> model = ProphetNetForCausalLM.from_pretrained('microsoft/prophetnet-large-uncased')
+        >>> tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
+        >>> model = ProphetNetForCausalLM.from_pretrained("microsoft/prophetnet-large-uncased")
         >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -2185,17 +2189,21 @@ class ProphetNetForCausalLM(ProphetNetPreTrainedModel):
         >>> from transformers import BertTokenizer, EncoderDecoderModel, ProphetNetTokenizer
         >>> import torch
 
-        >>> tokenizer_enc = BertTokenizer.from_pretrained('bert-large-uncased')
-        >>> tokenizer_dec = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
-        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-large-uncased", "microsoft/prophetnet-large-uncased")
+        >>> tokenizer_enc = BertTokenizer.from_pretrained("bert-large-uncased")
+        >>> tokenizer_dec = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
+        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "bert-large-uncased", "microsoft/prophetnet-large-uncased"
+        ... )
 
         >>> ARTICLE = (
-        ... "the us state department said wednesday it had received no "
-        ... "formal word from bolivia that it was expelling the us ambassador there "
-        ... "but said the charges made against him are `` baseless ."
+        ...     "the us state department said wednesday it had received no "
+        ...     "formal word from bolivia that it was expelling the us ambassador there "
+        ...     "but said the charges made against him are `` baseless ."
         ... )
         >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
-        >>> labels = tokenizer_dec("us rejects charges against its ambassador in bolivia", return_tensors="pt").input_ids
+        >>> labels = tokenizer_dec(
+        ...     "us rejects charges against its ambassador in bolivia", return_tensors="pt"
+        ... ).input_ids
         >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])
 
         >>> loss = outputs.loss
diff --git a/src/transformers/models/qdqbert/modeling_qdqbert.py b/src/transformers/models/qdqbert/modeling_qdqbert.py
index fd1c363fdb..a174e7769a 100755
--- a/src/transformers/models/qdqbert/modeling_qdqbert.py
+++ b/src/transformers/models/qdqbert/modeling_qdqbert.py
@@ -1083,10 +1083,10 @@ class QDQBertLMHeadModel(QDQBertPreTrainedModel):
         >>> from transformers import BertTokenizer, QDQBertLMHeadModel, QDQBertConfig
         >>> import torch
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
         >>> config = QDQBertConfig.from_pretrained("bert-base-cased")
         >>> config.is_decoder = True
-        >>> model = QDQBertLMHeadModel.from_pretrained('bert-base-cased', config=config)
+        >>> model = QDQBertLMHeadModel.from_pretrained("bert-base-cased", config=config)
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -1310,16 +1310,16 @@ class QDQBertForNextSentencePrediction(QDQBertPreTrainedModel):
         >>> from transformers import BertTokenizer, QDQBertForNextSentencePrediction
         >>> import torch
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = QDQBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        >>> model = QDQBertForNextSentencePrediction.from_pretrained("bert-base-uncased")
 
         >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
         >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
 
         >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
         >>> logits = outputs.logits
-        >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
         ```"""
 
         if "next_sentence_label" in kwargs:
diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py
index 5c37927667..b4d14f936d 100644
--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -303,8 +303,11 @@ class RagPreTrainedModel(PreTrainedModel):
 
         ```python
         >>> from transformers import RagModel
+
         >>> # initialize a RAG from two pretrained models.
-        >>> model = RagModel.from_question_encoder_generator_pretrained('facebook/dpr-question_encoder-single-nq-base', 't5-small')
+        >>> model = RagModel.from_question_encoder_generator_pretrained(
+        ...     "facebook/dpr-question_encoder-single-nq-base", "t5-small"
+        ... )
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./rag")
         >>> # load fine-tuned model
@@ -560,7 +563,9 @@ class RagModel(RagPreTrainedModel):
         >>> import torch
 
         >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
-        >>> retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
+        >>> retriever = RagRetriever.from_pretrained(
+        ...     "facebook/rag-token-base", index_name="exact", use_dummy_dataset=True
+        ... )
         >>> # initialize with RagRetriever to do everything in one forward call
         >>> model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever)
 
@@ -801,13 +806,15 @@ class RagSequenceForGeneration(RagPreTrainedModel):
         >>> import torch
 
         >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
-        >>> retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True)
+        >>> retriever = RagRetriever.from_pretrained(
+        ...     "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
+        ... )
         >>> # initialize with RagRetriever to do everything in one forward call
         >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
 
         >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
         >>> with tokenizer.as_target_tokenizer():
-        ...    targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt")
+        ...     targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt")
         >>> input_ids = inputs["input_ids"]
         >>> labels = targets["input_ids"]
         >>> outputs = model(input_ids=input_ids, labels=labels)
@@ -818,9 +825,16 @@ class RagSequenceForGeneration(RagPreTrainedModel):
         >>> question_hidden_states = model.question_encoder(input_ids)[0]
         >>> # 2. Retrieve
         >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
-        >>> doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)).squeeze(1)
+        >>> doc_scores = torch.bmm(
+        ...     question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
+        ... ).squeeze(1)
         >>> # 3. Forward to generator
-        >>> outputs = model(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=labels)
+        >>> outputs = model(
+        ...     context_input_ids=docs_dict["context_input_ids"],
+        ...     context_attention_mask=docs_dict["context_attention_mask"],
+        ...     doc_scores=doc_scores,
+        ...     decoder_input_ids=labels,
+        ... )
         ```"""
         n_docs = n_docs if n_docs is not None else self.config.n_docs
         exclude_bos_score = exclude_bos_score if exclude_bos_score is not None else self.config.exclude_bos_score
@@ -1259,13 +1273,15 @@ class RagTokenForGeneration(RagPreTrainedModel):
         >>> import torch
 
         >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
-        >>> retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
+        >>> retriever = RagRetriever.from_pretrained(
+        ...     "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True
+        ... )
         >>> # initialize with RagRetriever to do everything in one forward call
         >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
 
         >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
         >>> with tokenizer.as_target_tokenizer():
-        ...    targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt")
+        ...     targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt")
         >>> input_ids = inputs["input_ids"]
         >>> labels = targets["input_ids"]
         >>> outputs = model(input_ids=input_ids, labels=labels)
@@ -1276,12 +1292,23 @@ class RagTokenForGeneration(RagPreTrainedModel):
         >>> question_hidden_states = model.question_encoder(input_ids)[0]
         >>> # 2. Retrieve
         >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
-        >>> doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)).squeeze(1)
+        >>> doc_scores = torch.bmm(
+        ...     question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)
+        ... ).squeeze(1)
         >>> # 3. Forward to generator
-        >>> outputs = model(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=labels)
+        >>> outputs = model(
+        ...     context_input_ids=docs_dict["context_input_ids"],
+        ...     context_attention_mask=docs_dict["context_attention_mask"],
+        ...     doc_scores=doc_scores,
+        ...     decoder_input_ids=labels,
+        ... )
 
         >>> # or directly generate
-        >>> generated = model.generate(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores)
+        >>> generated = model.generate(
+        ...     context_input_ids=docs_dict["context_input_ids"],
+        ...     context_attention_mask=docs_dict["context_attention_mask"],
+        ...     doc_scores=doc_scores,
+        ... )
         >>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
         ```"""
         n_docs = n_docs if n_docs is not None else self.config.n_docs
diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py
index f4fa8dc418..9601e19be7 100644
--- a/src/transformers/models/rag/modeling_tf_rag.py
+++ b/src/transformers/models/rag/modeling_tf_rag.py
@@ -276,10 +276,18 @@ class TFRagPreTrainedModel(TFPreTrainedModel):
 
         ```python
         >>> from transformers import RagRetriever, TFRagModel
+
         >>> # initialize a RAG from two pretrained models.
-        >>> model = TFRagModel.from_pretrained_question_encoder_generator('facebook/dpr-question_encoder-single-nq-base', 't5-small')
+        >>> model = TFRagModel.from_pretrained_question_encoder_generator(
+        ...     "facebook/dpr-question_encoder-single-nq-base", "t5-small"
+        ... )
         >>> # alternatively, initialize from pytorch pretrained models can also be done
-        >>> model = TFRagModel.from_pretrained_question_encoder_generator('facebook/dpr-question_encoder-single-nq-base', "facebook/bart-base", generator_from_pt=True, question_encoder_from_pt=True)
+        >>> model = TFRagModel.from_pretrained_question_encoder_generator(
+        ...     "facebook/dpr-question_encoder-single-nq-base",
+        ...     "facebook/bart-base",
+        ...     generator_from_pt=True,
+        ...     question_encoder_from_pt=True,
+        ... )
 
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./rag")
@@ -555,11 +563,15 @@ class TFRagModel(TFRagPreTrainedModel):
         >>> import torch
 
         >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
-        >>> retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
+        >>> retriever = RagRetriever.from_pretrained(
+        ...     "facebook/rag-token-base", index_name="exact", use_dummy_dataset=True
+        ... )
         >>> # initialize with RagRetriever to do everything in one forward call
         >>> model = TFRagModel.from_pretrained("facebook/rag-token-base", retriever=retriever, from_pt=True)
 
-        >>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf")
+        >>> input_dict = tokenizer.prepare_seq2seq_batch(
+        ...     "How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf"
+        ... )
         >>> input_ids = input_dict["input_ids"]
         >>> outputs = model(input_ids)
         ```"""
@@ -930,11 +942,15 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
         >>> from transformers import RagTokenizer, RagRetriever, TFRagTokenForGeneration
 
         >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
-        >>> retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)
+        >>> retriever = RagRetriever.from_pretrained(
+        ...     "facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True
+        ... )
         >>> # initialize with RagRetriever to do everything in one forward call
         >>> model = TFRagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever, from_pt=True)
 
-        >>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf")
+        >>> input_dict = tokenizer.prepare_seq2seq_batch(
+        ...     "How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf"
+        ... )
         >>> outputs = model(input_dict, output_retrieved=True)
 
         >>> # or use retriever separately
@@ -943,12 +959,27 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
         >>> question_hidden_states = model.question_encoder(input_ids)[0]
         >>> # 2. Retrieve
         >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.numpy(), return_tensors="tf")
-        >>> doc_scores = tf.squeeze(tf.matmul(tf.expand_dims(question_hidden_states, axis=1), docs_dict["retrieved_doc_embeds"], transpose_b=True), axis=1)
+        >>> doc_scores = tf.squeeze(
+        ...     tf.matmul(
+        ...         tf.expand_dims(question_hidden_states, axis=1), docs_dict["retrieved_doc_embeds"], transpose_b=True
+        ...     ),
+        ...     axis=1,
+        ... )
         >>> # 3. Forward to generator
-        >>> outputs = model(inputs=None, context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=input_dict["labels"])
+        >>> outputs = model(
+        ...     inputs=None,
+        ...     context_input_ids=docs_dict["context_input_ids"],
+        ...     context_attention_mask=docs_dict["context_attention_mask"],
+        ...     doc_scores=doc_scores,
+        ...     decoder_input_ids=input_dict["labels"],
+        ... )
 
         >>> # or directly generate
-        >>> generated = model.generate(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores)
+        >>> generated = model.generate(
+        ...     context_input_ids=docs_dict["context_input_ids"],
+        ...     context_attention_mask=docs_dict["context_attention_mask"],
+        ...     doc_scores=doc_scores,
+        ... )
         >>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
         ```"""
 
@@ -1519,11 +1550,17 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
         >>> from transformers import RagTokenizer, RagRetriever, TFRagSequenceForGeneration
 
         >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
-        >>> retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True)
+        >>> retriever = RagRetriever.from_pretrained(
+        ...     "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True
+        ... )
         >>> # initialize with RagRetriever to do everything in one forward call
-        >>> model = TFRagRagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever, from_pt=True)
+        >>> model = TFRagSequenceForGeneration.from_pretrained(
+        ...     "facebook/rag-sequence-nq", retriever=retriever, from_pt=True
+        ... )
 
-        >>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf")
+        >>> input_dict = tokenizer.prepare_seq2seq_batch(
+        ...     "How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf"
+        ... )
         >>> outputs = model(input_dict, output_retrieved=True)
 
         >>> # or use retriever separately
@@ -1532,12 +1569,27 @@ class TFRagSequenceForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingL
         >>> question_hidden_states = model.question_encoder(input_ids)[0]
         >>> # 2. Retrieve
         >>> docs_dict = retriever(input_ids.numpy(), question_hidden_states.numpy(), return_tensors="tf")
-        >>> doc_scores = tf.squeeze(tf.matmul(tf.expand_dims(question_hidden_states, axis=1), docs_dict["retrieved_doc_embeds"], transpose_b=True), axis=1)
+        >>> doc_scores = tf.squeeze(
+        ...     tf.matmul(
+        ...         tf.expand_dims(question_hidden_states, axis=1), docs_dict["retrieved_doc_embeds"], transpose_b=True
+        ...     ),
+        ...     axis=1,
+        ... )
         >>> # 3. Forward to generator
-        >>> outputs = model(inputs=None, context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=input_dict["labels"])
+        >>> outputs = model(
+        ...     inputs=None,
+        ...     context_input_ids=docs_dict["context_input_ids"],
+        ...     context_attention_mask=docs_dict["context_attention_mask"],
+        ...     doc_scores=doc_scores,
+        ...     decoder_input_ids=input_dict["labels"],
+        ... )
 
         >>> # or directly generate
-        >>> generated = model.generate(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores)
+        >>> generated = model.generate(
+        ...     context_input_ids=docs_dict["context_input_ids"],
+        ...     context_attention_mask=docs_dict["context_attention_mask"],
+        ...     doc_scores=doc_scores,
+        ... )
         >>> generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
         ```"""
 
diff --git a/src/transformers/models/rag/retrieval_rag.py b/src/transformers/models/rag/retrieval_rag.py
index db6aef4940..6a498169d5 100644
--- a/src/transformers/models/rag/retrieval_rag.py
+++ b/src/transformers/models/rag/retrieval_rag.py
@@ -345,22 +345,35 @@ class RagRetriever:
     ```python
     >>> # To load the default "wiki_dpr" dataset with 21M passages from wikipedia (index name is 'compressed' or 'exact')
     >>> from transformers import RagRetriever
-    >>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', dataset="wiki_dpr", index_name='compressed')
+
+    >>> retriever = RagRetriever.from_pretrained(
+    ...     "facebook/dpr-ctx_encoder-single-nq-base", dataset="wiki_dpr", index_name="compressed"
+    ... )
 
     >>> # To load your own indexed dataset built with the datasets library. More info on how to build the indexed dataset in examples/rag/use_own_knowledge_dataset.py
     >>> from transformers import RagRetriever
-    >>> dataset = ...  # dataset must be a datasets.Datasets object with columns "title", "text" and "embeddings", and it must have a faiss index
-    >>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', indexed_dataset=dataset)
+
+    >>> dataset = (
+    ...     ...
+    ... )  # dataset must be a datasets.Datasets object with columns "title", "text" and "embeddings", and it must have a faiss index
+    >>> retriever = RagRetriever.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base", indexed_dataset=dataset)
 
     >>> # To load your own indexed dataset built with the datasets library that was saved on disk. More info in examples/rag/use_own_knowledge_dataset.py
     >>> from transformers import RagRetriever
+
     >>> dataset_path = "path/to/my/dataset"  # dataset saved via *dataset.save_to_disk(...)*
     >>> index_path = "path/to/my/index.faiss"  # faiss index saved via *dataset.get_index("embeddings").save(...)*
-    >>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', index_name='custom', passages_path=dataset_path, index_path=index_path)
+    >>> retriever = RagRetriever.from_pretrained(
+    ...     "facebook/dpr-ctx_encoder-single-nq-base",
+    ...     index_name="custom",
+    ...     passages_path=dataset_path,
+    ...     index_path=index_path,
+    ... )
 
     >>> # To load the legacy index built originally for Rag's paper
     >>> from transformers import RagRetriever
-    >>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', index_name='legacy')
+
+    >>> retriever = RagRetriever.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base", index_name="legacy")
     ```"""
 
     def __init__(self, config, question_encoder_tokenizer, generator_tokenizer, index=None, init_retrieval=True):
diff --git a/src/transformers/models/rembert/modeling_rembert.py b/src/transformers/models/rembert/modeling_rembert.py
index 5ec67c45de..1f9938a296 100755
--- a/src/transformers/models/rembert/modeling_rembert.py
+++ b/src/transformers/models/rembert/modeling_rembert.py
@@ -1085,10 +1085,10 @@ class RemBertForCausalLM(RemBertPreTrainedModel):
         >>> from transformers import RemBertTokenizer, RemBertForCausalLM, RemBertConfig
         >>> import torch
 
-        >>> tokenizer = RemBertTokenizer.from_pretrained('google/rembert')
+        >>> tokenizer = RemBertTokenizer.from_pretrained("google/rembert")
         >>> config = RemBertConfig.from_pretrained("google/rembert")
         >>> config.is_decoder = True
-        >>> model = RemBertForCausalLM.from_pretrained('google/rembert', config=config)
+        >>> model = RemBertForCausalLM.from_pretrained("google/rembert", config=config)
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/roberta/modeling_roberta.py b/src/transformers/models/roberta/modeling_roberta.py
index bcbe63cc57..dc5d717f84 100644
--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
@@ -956,10 +956,10 @@ class RobertaForCausalLM(RobertaPreTrainedModel):
         >>> from transformers import RobertaTokenizer, RobertaForCausalLM, RobertaConfig
         >>> import torch
 
-        >>> tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
         >>> config = RobertaConfig.from_pretrained("roberta-base")
         >>> config.is_decoder = True
-        >>> model = RobertaForCausalLM.from_pretrained('roberta-base', config=config)
+        >>> model = RobertaForCausalLM.from_pretrained("roberta-base", config=config)
 
         >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/roformer/modeling_roformer.py b/src/transformers/models/roformer/modeling_roformer.py
index c51145f70a..0609d6c26d 100644
--- a/src/transformers/models/roformer/modeling_roformer.py
+++ b/src/transformers/models/roformer/modeling_roformer.py
@@ -1131,10 +1131,10 @@ class RoFormerForCausalLM(RoFormerPreTrainedModel):
         >>> from transformers import RoFormerTokenizer, RoFormerForCausalLM, RoFormerConfig
         >>> import torch
 
-        >>> tokenizer = RoFormerTokenizer.from_pretrained('junnyu/roformer_chinese_base')
+        >>> tokenizer = RoFormerTokenizer.from_pretrained("junnyu/roformer_chinese_base")
         >>> config = RoFormerConfig.from_pretrained("junnyu/roformer_chinese_base")
         >>> config.is_decoder = True
-        >>> model = RoFormerForCausalLM.from_pretrained('junnyu/roformer_chinese_base', config=config)
+        >>> model = RoFormerForCausalLM.from_pretrained("junnyu/roformer_chinese_base", config=config)
 
         >>> inputs = tokenizer("今天天气非常好。", return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/roformer/tokenization_roformer.py b/src/transformers/models/roformer/tokenization_roformer.py
index e71ac937ca..e5e3728c03 100644
--- a/src/transformers/models/roformer/tokenization_roformer.py
+++ b/src/transformers/models/roformer/tokenization_roformer.py
@@ -103,7 +103,8 @@ class RoFormerTokenizer(PreTrainedTokenizer):
 
     ```python
     >>> from transformers import RoFormerTokenizer
-    >>> tokenizer = RoFormerTokenizer.from_pretrained('junnyu/roformer_chinese_base')
+
+    >>> tokenizer = RoFormerTokenizer.from_pretrained("junnyu/roformer_chinese_base")
     >>> tokenizer.tokenize("今天天气非常好。")
     # ['今', '天', '天', '气', '非常', '好', '。']
     ```"""
diff --git a/src/transformers/models/roformer/tokenization_roformer_fast.py b/src/transformers/models/roformer/tokenization_roformer_fast.py
index 12ccb30afd..26c37d4580 100644
--- a/src/transformers/models/roformer/tokenization_roformer_fast.py
+++ b/src/transformers/models/roformer/tokenization_roformer_fast.py
@@ -74,7 +74,8 @@ class RoFormerTokenizerFast(PreTrainedTokenizerFast):
 
     ```python
     >>> from transformers import RoFormerTokenizerFast
-    >>> tokenizer = RoFormerTokenizerFast.from_pretrained('junnyu/roformer_chinese_base')
+
+    >>> tokenizer = RoFormerTokenizerFast.from_pretrained("junnyu/roformer_chinese_base")
     >>> tokenizer.tokenize("今天天气非常好。")
     # ['今', '天', '天', '气', '非常', '好', '。']
     ```"""
diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py
index 4dd4ce7609..afa6d8cde8 100755
--- a/src/transformers/models/segformer/modeling_segformer.py
+++ b/src/transformers/models/segformer/modeling_segformer.py
@@ -493,7 +493,7 @@ class SegformerModel(SegformerPreTrainedModel):
         >>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
         >>> model = SegformerModel("nvidia/segformer-b0-finetuned-ade-512-512")
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
@@ -570,11 +570,11 @@ class SegformerForImageClassification(SegformerPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = SegformerFeatureExtractor.from_pretrained('nvidia/mit-b0')
-        >>> model = SegformerForImageClassification.from_pretrained('nvidia/mit-b0')
+        >>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/mit-b0")
+        >>> model = SegformerForImageClassification.from_pretrained("nvidia/mit-b0")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -729,12 +729,12 @@ class SegformerForSemanticSegmentation(SegformerPreTrainedModel):
         >>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
         >>> model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
-        >>> logits = outputs.logits # shape (batch_size, num_labels, height/4, width/4)
+        >>> logits = outputs.logits  # shape (batch_size, num_labels, height/4, width/4)
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
         output_hidden_states = (
diff --git a/src/transformers/models/sew_d/modeling_sew_d.py b/src/transformers/models/sew_d/modeling_sew_d.py
index 391b3c5b3c..1095adbdef 100644
--- a/src/transformers/models/sew_d/modeling_sew_d.py
+++ b/src/transformers/models/sew_d/modeling_sew_d.py
@@ -480,10 +480,10 @@ class XSoftmax(torch.autograd.Function):
     >>> from transformers.models.deberta_v2.modeling_deberta_v2 import XSoftmax
 
     >>> # Make a tensor
-    >>> x = torch.randn([4,20,100])
+    >>> x = torch.randn([4, 20, 100])
 
     >>> # Create a mask
-    >>> mask = (x>0).int()
+    >>> mask = (x > 0).int()
 
     >>> # Specify the dimension to apply softmax
     >>> dim = -1
diff --git a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py
index c7d992c076..ca3e4966aa 100644
--- a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py
@@ -58,17 +58,17 @@ class SpeechEncoderDecoderConfig(PretrainedConfig):
 
     >>> # Accessing the model configuration
     >>> config_encoder = model.config.encoder
-    >>> config_decoder  = model.config.decoder
+    >>> config_decoder = model.config.decoder
     >>> # set decoder config to causal lm
     >>> config_decoder.is_decoder = True
     >>> config_decoder.add_cross_attention = True
 
     >>> # Saving the model, including its configuration
-    >>> model.save_pretrained('my-model')
+    >>> model.save_pretrained("my-model")
 
     >>> # loading model and config from pretrained folder
-    >>> encoder_decoder_config = SpeechEncoderDecoderConfig.from_pretrained('my-model')
-    >>> model = SpeechEncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
+    >>> encoder_decoder_config = SpeechEncoderDecoderConfig.from_pretrained("my-model")
+    >>> model = SpeechEncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
     ```"""
     model_type = "speech-encoder-decoder"
     is_composition = True
diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
index f0a5c0fbd3..e3d70dff9f 100644
--- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
@@ -330,8 +330,11 @@ class SpeechEncoderDecoderModel(PreTrainedModel):
 
         ```python
         >>> from transformers import SpeechEncoderDecoderModel
+
         >>> # initialize a wav2vec2bert from a pretrained Wav2Vec2 and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
-        >>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained('facebook/wav2vec2-base-960h', 'bert-base-uncased')
+        >>> model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "facebook/wav2vec2-base-960h", "bert-base-uncased"
+        ... )
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./wav2vec2bert")
         >>> # load fine-tuned model
@@ -447,8 +450,8 @@ class SpeechEncoderDecoderModel(PreTrainedModel):
         >>> from datasets import load_dataset
         >>> import torch
 
-        >>> processor = Speech2Text2Processor.from_pretrained('facebook/s2t-wav2vec2-large-en-de')
-        >>> model = SpeechEncoderDecoderModel.from_pretrained('facebook/s2t-wav2vec2-large-en-de')
+        >>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
+        >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
 
         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 
diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
index 5264017423..9e01fdb07b 100755
--- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
@@ -1310,15 +1310,19 @@ class Speech2TextForConditionalGeneration(Speech2TextPreTrainedModel):
         >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
         >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")
 
+
         >>> def map_to_array(batch):
-        >>>     speech, _ = sf.read(batch["file"])
-        >>>     batch["speech"] = speech
-        >>>     return batch
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...     return batch
+
 
         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> ds = ds.map(map_to_array)
 
-        >>> input_features = processor(ds["speech"][0], sampling_rate=16000, return_tensors="pt").input_features  # Batch size 1
+        >>> input_features = processor(
+        ...     ds["speech"][0], sampling_rate=16000, return_tensors="pt"
+        ... ).input_features  # Batch size 1
         >>> generated_ids = model.generate(input_ids=input_features)
 
         >>> transcription = processor.batch_decode(generated_ids)
diff --git a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
index 9d09253203..61dba21dde 100755
--- a/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
+++ b/src/transformers/models/speech_to_text_2/modeling_speech_to_text_2.py
@@ -859,12 +859,18 @@ class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel):
         Example:
 
         ```python
-        >>> from transformers import SpeechEncoderDecoderModel, Speech2Text2ForCausalLM, Wav2Vec2Model, Speech2Text2Config, Wav2Vec2Config
+        >>> from transformers import (
+        ...     SpeechEncoderDecoderModel,
+        ...     Speech2Text2ForCausalLM,
+        ...     Wav2Vec2Model,
+        ...     Speech2Text2Config,
+        ...     Wav2Vec2Config,
+        ... )
 
         >>> encoder = Wav2Vec2Model(Wav2Vec2Config())
         >>> decoder = Speech2Text2ForCausalLM(Speech2Text2Config())
-
         # init speech2text model
+
         >>> model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder)
         ```"""
 
diff --git a/src/transformers/models/squeezebert/modeling_squeezebert.py b/src/transformers/models/squeezebert/modeling_squeezebert.py
index 75ec95a2b8..0e3e88709e 100644
--- a/src/transformers/models/squeezebert/modeling_squeezebert.py
+++ b/src/transformers/models/squeezebert/modeling_squeezebert.py
@@ -471,7 +471,7 @@ SQUEEZEBERT_START_DOCSTRING = r"""
 
     Hierarchy:
 
-    ```python
+    ```
     Internal class hierarchy:
     SqueezeBertModel
         SqueezeBertEncoder
@@ -483,7 +483,7 @@ SQUEEZEBERT_START_DOCSTRING = r"""
 
     Data layouts:
 
-    ```python
+    ```
     Input data is in [batch, sequence_length, hidden_size] format.
 
     Data inside the encoder is in [batch, hidden_size, sequence_length] format. But, if `output_hidden_states == True`, the data from inside the encoder is returned in [batch, sequence_length, hidden_size] format.
diff --git a/src/transformers/models/t5/modeling_flax_t5.py b/src/transformers/models/t5/modeling_flax_t5.py
index 4d3aa3f397..015c698f42 100644
--- a/src/transformers/models/t5/modeling_flax_t5.py
+++ b/src/transformers/models/t5/modeling_flax_t5.py
@@ -1055,11 +1055,11 @@ class FlaxT5PreTrainedModel(FlaxPreTrainedModel):
         ```python
         >>> from transformers import T5Tokenizer, FlaxT5ForConditionalGeneration
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = FlaxT5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = FlaxT5ForConditionalGeneration.from_pretrained("t5-small")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, return_tensors='np')
+        >>> inputs = tokenizer(text, return_tensors="np")
         >>> encoder_outputs = model.encode(**inputs)
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -1117,11 +1117,11 @@ class FlaxT5PreTrainedModel(FlaxPreTrainedModel):
         >>> from transformers import T5Tokenizer, FlaxT5ForConditionalGeneration
         >>> import jax.numpy as jnp
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = FlaxT5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = FlaxT5ForConditionalGeneration.from_pretrained("t5-small")
 
         >>> text = "My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, return_tensors='np')
+        >>> inputs = tokenizer(text, return_tensors="np")
         >>> encoder_outputs = model.encode(**inputs)
 
         >>> decoder_start_token_id = model.config.decoder_start_token_id
@@ -1333,10 +1333,12 @@ FLAX_T5_MODEL_DOCSTRING = """
     ```python
     >>> from transformers import T5Tokenizer, FlaxT5Model
 
-    >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-    >>> model = FlaxT5Model.from_pretrained('t5-small')
+    >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+    >>> model = FlaxT5Model.from_pretrained("t5-small")
 
-    >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="np").input_ids
+    >>> input_ids = tokenizer(
+    ...     "Studies have been shown that owning a dog is good for you", return_tensors="np"
+    ... ).input_ids
     >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="np").input_ids
 
     >>> # forward pass
@@ -1483,11 +1485,11 @@ class FlaxT5ForConditionalGeneration(FlaxT5PreTrainedModel):
         >>> from transformers import T5Tokenizer, FlaxT5ForConditionalGeneration
         >>> import jax.numpy as jnp
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = FlaxT5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = FlaxT5ForConditionalGeneration.from_pretrained("t5-small")
 
         >>> text = "summarize: My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer(text, return_tensors='np')
+        >>> inputs = tokenizer(text, return_tensors="np")
         >>> encoder_outputs = model.encode(**inputs)
 
         >>> decoder_start_token_id = model.config.decoder_start_token_id
@@ -1631,14 +1633,14 @@ FLAX_T5_CONDITIONAL_GENERATION_DOCSTRING = """
     ```python
     >>> from transformers import T5Tokenizer, FlaxT5ForConditionalGeneration
 
-    >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-    >>> model = FlaxT5ForConditionalGeneration.from_pretrained('t5-small')
+    >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+    >>> model = FlaxT5ForConditionalGeneration.from_pretrained("t5-small")
 
     >>> ARTICLE_TO_SUMMARIZE = "summarize: My friends are cool but they eat too many carbs."
-    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], return_tensors='np')
+    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], return_tensors="np")
 
     >>> # Generate Summary
-    >>> summary_ids = model.generate(inputs['input_ids']).sequences
+    >>> summary_ids = model.generate(inputs["input_ids"]).sequences
     >>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False))
     ```
 """
diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
index 77b6a989d9..ad6fe90227 100644
--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -204,12 +204,13 @@ PARALLELIZE_DOCSTRING = r"""
 
     ```python
     # Here is an example of a device map on a machine with 4 GPUs using t5-3b, which has a total of 24 attention modules:
-    model = T5ForConditionalGeneration.from_pretrained('t5-3b')
-    device_map = {0: [0, 1, 2],
-
-             1: [3, 4, 5, 6, 7, 8, 9],
-             2: [10, 11, 12, 13, 14, 15, 16],
-             3: [17, 18, 19, 20, 21, 22, 23]}
+    model = T5ForConditionalGeneration.from_pretrained("t5-3b")
+    device_map = {
+        0: [0, 1, 2],
+        1: [3, 4, 5, 6, 7, 8, 9],
+        2: [10, 11, 12, 13, 14, 15, 16],
+        3: [17, 18, 19, 20, 21, 22, 23],
+    }
     model.parallelize(device_map)
     ```
 """
@@ -220,14 +221,15 @@ DEPARALLELIZE_DOCSTRING = r"""
 
     ```python
     # On a 4 GPU machine with t5-3b:
-    model = T5ForConditionalGeneration.from_pretrained('t5-3b')
-    device_map = {0: [0, 1, 2],
-
-                 1: [3, 4, 5, 6, 7, 8, 9],
-                 2: [10, 11, 12, 13, 14, 15, 16],
-                 3: [17, 18, 19, 20, 21, 22, 23]}
-    model.parallelize(device_map) # Splits the model across several devices
-    model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+    model = T5ForConditionalGeneration.from_pretrained("t5-3b")
+    device_map = {
+        0: [0, 1, 2],
+        1: [3, 4, 5, 6, 7, 8, 9],
+        2: [10, 11, 12, 13, 14, 15, 16],
+        3: [17, 18, 19, 20, 21, 22, 23],
+    }
+    model.parallelize(device_map)  # Splits the model across several devices
+    model.deparallelize()  # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
     ```
 """
 
@@ -1344,10 +1346,12 @@ class T5Model(T5PreTrainedModel):
         ```python
         >>> from transformers import T5Tokenizer, T5Model
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = T5Model.from_pretrained('t5-small')
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = T5Model.from_pretrained("t5-small")
 
-        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
         >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
 
         >>> # forward pass
@@ -1542,18 +1546,20 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
         ```python
         >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
 
         >>> # training
-        >>> input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-        >>> labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
+        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
         >>> outputs = model(input_ids=input_ids, labels=labels)
         >>> loss = outputs.loss
         >>> logits = outputs.logits
 
         >>> # inference
-        >>> input_ids = tokenizer("summarize: studies have shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> input_ids = tokenizer(
+        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
         >>> outputs = model.generate(input_ids)
         >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
         >>> # studies have shown that owning a dog is good for you.
@@ -1796,9 +1802,12 @@ class T5EncoderModel(T5PreTrainedModel):
 
         ```python
         >>> from transformers import T5Tokenizer, T5EncoderModel
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = T5EncoderModel.from_pretrained('t5-small')
-        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = T5EncoderModel.from_pretrained("t5-small")
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
         >>> outputs = model(input_ids=input_ids)
         >>> last_hidden_states = outputs.last_hidden_state
         ```"""
diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py
index c779384ff8..3802594ee3 100644
--- a/src/transformers/models/t5/modeling_tf_t5.py
+++ b/src/transformers/models/t5/modeling_tf_t5.py
@@ -1177,10 +1177,12 @@ class TFT5Model(TFT5PreTrainedModel):
         ```python
         >>> from transformers import T5Tokenizer, TFT5Model
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = TFT5Model.from_pretrained('t5-small')
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = TFT5Model.from_pretrained("t5-small")
 
-        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="tf").input_ids  # Batch size 1
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="tf"
+        ... ).input_ids  # Batch size 1
         >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="tf").input_ids  # Batch size 1
 
         >>> # forward pass
@@ -1375,18 +1377,20 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
         ```python
         >>> from transformers import T5Tokenizer, TFT5ForConditionalGeneration
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = TFT5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = TFT5ForConditionalGeneration.from_pretrained("t5-small")
 
         >>> # training
-        >>> inputs = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='tf').input_ids
-        >>> labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='tf').input_ids
+        >>> inputs = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="tf").input_ids
+        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="tf").input_ids
         >>> outputs = model(inputs, labels=labels)
         >>> loss = outputs.loss
         >>> logits = outputs.logits
 
         >>> # inference
-        >>> inputs = tokenizer("summarize: studies have shown that owning a dog is good for you", return_tensors="tf").input_ids  # Batch size 1
+        >>> inputs = tokenizer(
+        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="tf"
+        ... ).input_ids  # Batch size 1
         >>> outputs = model.generate(inputs)
         >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
         >>> # studies have shown that owning a dog is good for you
@@ -1633,10 +1637,12 @@ class TFT5EncoderModel(TFT5PreTrainedModel):
         ```python
         >>> from transformers import T5Tokenizer, TFT5EncoderModel
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = TFT5EncoderModel.from_pretrained('t5-small')
+        >>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        >>> model = TFT5EncoderModel.from_pretrained("t5-small")
 
-        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="tf").input_ids  # Batch size 1
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="tf"
+        ... ).input_ids  # Batch size 1
         >>> outputs = model(input_ids)
         ```"""
         inputs = input_processing(
diff --git a/src/transformers/models/tapas/configuration_tapas.py b/src/transformers/models/tapas/configuration_tapas.py
index 6eb78acc48..cc45d05259 100644
--- a/src/transformers/models/tapas/configuration_tapas.py
+++ b/src/transformers/models/tapas/configuration_tapas.py
@@ -133,6 +133,7 @@ class TapasConfig(PretrainedConfig):
 
     ```python
     >>> from transformers import TapasModel, TapasConfig
+
     >>> # Initializing a default (SQA) Tapas configuration
     >>> configuration = TapasConfig()
     >>> # Initializing a model from the configuration
diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py
index 680bd5bf7b..3d8ec1bc5e 100644
--- a/src/transformers/models/tapas/modeling_tapas.py
+++ b/src/transformers/models/tapas/modeling_tapas.py
@@ -916,12 +916,13 @@ class TapasModel(TapasPreTrainedModel):
         >>> from transformers import TapasTokenizer, TapasModel
         >>> import pandas as pd
 
-        >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base')
-        >>> model = TapasModel.from_pretrained('google/tapas-base')
+        >>> tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")
+        >>> model = TapasModel.from_pretrained("google/tapas-base")
 
-        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
-        ...         'Age': ["56", "45", "59"],
-        ...         'Number of movies': ["87", "53", "69"]
+        >>> data = {
+        ...     "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+        ...     "Age": ["56", "45", "59"],
+        ...     "Number of movies": ["87", "53", "69"],
         ... }
         >>> table = pd.DataFrame.from_dict(data)
         >>> queries = ["How many movies has George Clooney played in?", "How old is Brad Pitt?"]
@@ -1056,17 +1057,22 @@ class TapasForMaskedLM(TapasPreTrainedModel):
         >>> from transformers import TapasTokenizer, TapasForMaskedLM
         >>> import pandas as pd
 
-        >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base')
-        >>> model = TapasForMaskedLM.from_pretrained('google/tapas-base')
+        >>> tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")
+        >>> model = TapasForMaskedLM.from_pretrained("google/tapas-base")
 
-        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
-        ...         'Age': ["56", "45", "59"],
-        ...         'Number of movies': ["87", "53", "69"]
+        >>> data = {
+        ...     "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+        ...     "Age": ["56", "45", "59"],
+        ...     "Number of movies": ["87", "53", "69"],
         ... }
         >>> table = pd.DataFrame.from_dict(data)
 
-        >>> inputs = tokenizer(table=table, queries="How many [MASK] has George [MASK] played in?", return_tensors="pt")
-        >>> labels = tokenizer(table=table, queries="How many movies has George Clooney played in?", return_tensors="pt")["input_ids"]
+        >>> inputs = tokenizer(
+        ...     table=table, queries="How many [MASK] has George [MASK] played in?", return_tensors="pt"
+        ... )
+        >>> labels = tokenizer(
+        ...     table=table, queries="How many movies has George Clooney played in?", return_tensors="pt"
+        ... )["input_ids"]
 
         >>> outputs = model(**inputs, labels=labels)
         >>> logits = outputs.logits
@@ -1204,12 +1210,13 @@ class TapasForQuestionAnswering(TapasPreTrainedModel):
         >>> from transformers import TapasTokenizer, TapasForQuestionAnswering
         >>> import pandas as pd
 
-        >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base-finetuned-wtq')
-        >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base-finetuned-wtq')
+        >>> tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq")
+        >>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq")
 
-        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
-        ...         'Age': ["56", "45", "59"],
-        ...         'Number of movies': ["87", "53", "69"]
+        >>> data = {
+        ...     "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+        ...     "Age": ["56", "45", "59"],
+        ...     "Number of movies": ["87", "53", "69"],
         ... }
         >>> table = pd.DataFrame.from_dict(data)
         >>> queries = ["How many movies has George Clooney played in?", "How old is Brad Pitt?"]
@@ -1501,18 +1508,22 @@ class TapasForSequenceClassification(TapasPreTrainedModel):
         >>> import torch
         >>> import pandas as pd
 
-        >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base-finetuned-tabfact')
-        >>> model = TapasForSequenceClassification.from_pretrained('google/tapas-base-finetuned-tabfact')
+        >>> tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-tabfact")
+        >>> model = TapasForSequenceClassification.from_pretrained("google/tapas-base-finetuned-tabfact")
 
-        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
-        ...         'Age': ["56", "45", "59"],
-        ...         'Number of movies': ["87", "53", "69"]
+        >>> data = {
+        ...     "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+        ...     "Age": ["56", "45", "59"],
+        ...     "Number of movies": ["87", "53", "69"],
         ... }
         >>> table = pd.DataFrame.from_dict(data)
-        >>> queries = ["There is only one actor who is 45 years old", "There are 3 actors which played in more than 60 movies"]
+        >>> queries = [
+        ...     "There is only one actor who is 45 years old",
+        ...     "There are 3 actors which played in more than 60 movies",
+        ... ]
 
         >>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt")
-        >>> labels = torch.tensor([1, 0]) # 1 means entailed, 0 means refuted
+        >>> labels = torch.tensor([1, 0])  # 1 means entailed, 0 means refuted
 
         >>> outputs = model(**inputs, labels=labels)
         >>> loss = outputs.loss
diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py
index 5b2d5dec59..7818e1c90c 100644
--- a/src/transformers/models/tapas/modeling_tf_tapas.py
+++ b/src/transformers/models/tapas/modeling_tf_tapas.py
@@ -1004,12 +1004,13 @@ class TFTapasModel(TFTapasPreTrainedModel):
         >>> from transformers import TapasTokenizer, TapasModel
         >>> import pandas as pd
 
-        >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base')
-        >>> model = TapasModel.from_pretrained('google/tapas-base')
+        >>> tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")
+        >>> model = TapasModel.from_pretrained("google/tapas-base")
 
-        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
-        ...         'Age': ["56", "45", "59"],
-        ...         'Number of movies': ["87", "53", "69"]
+        >>> data = {
+        ...     "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+        ...     "Age": ["56", "45", "59"],
+        ...     "Number of movies": ["87", "53", "69"],
         ... }
         >>> table = pd.DataFrame.from_dict(data)
         >>> queries = ["How many movies has George Clooney played in?", "How old is Brad Pitt?"]
@@ -1109,17 +1110,22 @@ class TFTapasForMaskedLM(TFTapasPreTrainedModel, TFMaskedLanguageModelingLoss):
         >>> from transformers import TapasTokenizer, TapasForMaskedLM
         >>> import pandas as pd
 
-        >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base')
-        >>> model = TapasForMaskedLM.from_pretrained('google/tapas-base')
+        >>> tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")
+        >>> model = TapasForMaskedLM.from_pretrained("google/tapas-base")
 
-        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
-        ...         'Age': ["56", "45", "59"],
-        ...         'Number of movies': ["87", "53", "69"]
+        >>> data = {
+        ...     "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+        ...     "Age": ["56", "45", "59"],
+        ...     "Number of movies": ["87", "53", "69"],
         ... }
         >>> table = pd.DataFrame.from_dict(data)
 
-        >>> inputs = tokenizer(table=table, queries="How many [MASK] has George [MASK] played in?", return_tensors="tf")
-        >>> labels = tokenizer(table=table, queries="How many movies has George Clooney played in?", return_tensors="tf")["input_ids"]
+        >>> inputs = tokenizer(
+        ...     table=table, queries="How many [MASK] has George [MASK] played in?", return_tensors="tf"
+        ... )
+        >>> labels = tokenizer(
+        ...     table=table, queries="How many movies has George Clooney played in?", return_tensors="tf"
+        ... )["input_ids"]
 
         >>> outputs = model(**inputs, labels=labels)
         >>> logits = outputs.logits
@@ -1359,12 +1365,13 @@ class TFTapasForQuestionAnswering(TFTapasPreTrainedModel):
         >>> from transformers import TapasTokenizer, TapasForQuestionAnswering
         >>> import pandas as pd
 
-        >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base-finetuned-wtq')
-        >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base-finetuned-wtq')
+        >>> tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq")
+        >>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq")
 
-        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
-        ...         'Age': ["56", "45", "59"],
-        ...         'Number of movies': ["87", "53", "69"]
+        >>> data = {
+        ...     "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+        ...     "Age": ["56", "45", "59"],
+        ...     "Number of movies": ["87", "53", "69"],
         ... }
         >>> table = pd.DataFrame.from_dict(data)
         >>> queries = ["How many movies has George Clooney played in?", "How old is Brad Pitt?"]
@@ -1681,18 +1688,22 @@ class TFTapasForSequenceClassification(TFTapasPreTrainedModel, TFSequenceClassif
         >>> import tensorflow as tf
         >>> import pandas as pd
 
-        >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base-finetuned-tabfact')
-        >>> model = TapasForSequenceClassification.from_pretrained('google/tapas-base-finetuned-tabfact')
+        >>> tokenizer = TapasTokenizer.from_pretrained("google/tapas-base-finetuned-tabfact")
+        >>> model = TapasForSequenceClassification.from_pretrained("google/tapas-base-finetuned-tabfact")
 
-        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
-        ...         'Age': ["56", "45", "59"],
-        ...         'Number of movies': ["87", "53", "69"]
+        >>> data = {
+        ...     "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+        ...     "Age": ["56", "45", "59"],
+        ...     "Number of movies": ["87", "53", "69"],
         ... }
         >>> table = pd.DataFrame.from_dict(data)
-        >>> queries = ["There is only one actor who is 45 years old", "There are 3 actors which played in more than 60 movies"]
+        >>> queries = [
+        ...     "There is only one actor who is 45 years old",
+        ...     "There are 3 actors which played in more than 60 movies",
+        ... ]
 
         >>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf")
-        >>> labels = tf.convert_to_tensor([1, 0]) # 1 means entailed, 0 means refuted
+        >>> labels = tf.convert_to_tensor([1, 0])  # 1 means entailed, 0 means refuted
 
         >>> outputs = model(**inputs, labels=labels)
         >>> loss = outputs.loss
diff --git a/src/transformers/models/trocr/modeling_trocr.py b/src/transformers/models/trocr/modeling_trocr.py
index c1a82aaf5c..2d3233a6d8 100644
--- a/src/transformers/models/trocr/modeling_trocr.py
+++ b/src/transformers/models/trocr/modeling_trocr.py
@@ -896,8 +896,8 @@ class TrOCRForCausalLM(TrOCRPreTrainedModel):
 
         >>> encoder = ViTModel(ViTConfig())
         >>> decoder = TrOCRForCausalLM(TrOCRConfig())
-
         # init vision2text model
+
         >>> model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)
         ```"""
 
diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py
index 1dbd7718a7..c32da4f309 100755
--- a/src/transformers/models/unispeech/modeling_unispeech.py
+++ b/src/transformers/models/unispeech/modeling_unispeech.py
@@ -1270,9 +1270,7 @@ class UniSpeechForPreTraining(UniSpeechPreTrainedModel):
         ...     outputs = model(input_values, mask_time_indices=mask_time_indices)
 
         >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
-        >>> cosine_sim = torch.cosine_similarity(
-        ...     outputs.projected_states, outputs.projected_quantized_states, dim=-1
-        ... )
+        >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)
 
         >>> # show that cosine similarity is much higher than random
         >>> assert cosine_sim[mask_time_indices].mean() > 0.5
diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
index 837ab17a18..a9969b7159 100755
--- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
@@ -1303,9 +1303,7 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
         ...     outputs = model(input_values, mask_time_indices=mask_time_indices)
 
         >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
-        >>> cosine_sim = torch.cosine_similarity(
-        ...     outputs.projected_states, outputs.projected_quantized_states, dim=-1
-        ... )
+        >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)
 
         >>> # show that cosine similarity is much higher than random
         >>> assert cosine_sim[mask_time_indices].mean() > 0.5
diff --git a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
index 72fba3efa8..b2c3b2aacc 100644
--- a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
@@ -58,17 +58,17 @@ class VisionEncoderDecoderConfig(PretrainedConfig):
 
     >>> # Accessing the model configuration
     >>> config_encoder = model.config.encoder
-    >>> config_decoder  = model.config.decoder
+    >>> config_decoder = model.config.decoder
     >>> # set decoder config to causal lm
     >>> config_decoder.is_decoder = True
     >>> config_decoder.add_cross_attention = True
 
     >>> # Saving the model, including its configuration
-    >>> model.save_pretrained('my-model')
+    >>> model.save_pretrained("my-model")
 
     >>> # loading model and config from pretrained folder
-    >>> encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained('my-model')
-    >>> model = VisionEncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
+    >>> encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained("my-model")
+    >>> model = VisionEncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
     ```"""
     model_type = "vision-encoder-decoder"
     is_composition = True
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
index 524ccf3820..446f92fec0 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
@@ -397,13 +397,13 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
+        >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
 
         >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
+        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained("vit", "gpt2")
 
         >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
         >>> encoder_outputs = model.encode(pixel_values)
@@ -474,13 +474,13 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
+        >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
 
         >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
+        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained("vit", "gpt2")
 
         >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
         >>> encoder_outputs = model.encode(pixel_values)
@@ -601,16 +601,16 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
+        >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
 
         >>> # load output tokenizer
-        >>> tokenizer_output = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> tokenizer_output = GPT2Tokenizer.from_pretrained("gpt2")
 
         >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
+        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained("vit", "gpt2")
 
         >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
 
@@ -746,8 +746,11 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
 
         ```python
         >>> from transformers import FlaxVisionEncoderDecoderModel
+
         >>> # initialize a vit-gpt2 from a pretrained ViT and a pretrained GPT2 model. Note that the cross-attention layers will be randomly initialized
-        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('google/vit-base-patch16-224-in21k', 'gpt2')
+        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "google/vit-base-patch16-224-in21k", "gpt2"
+        ... )
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./vit-gpt2")
         >>> # load fine-tuned model
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
index f4221c6aa0..6f9b547752 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
@@ -302,8 +302,11 @@ class VisionEncoderDecoderModel(PreTrainedModel):
 
         ```python
         >>> from transformers import VisionEncoderDecoderModel
+
         >>> # initialize a vit-bert from a pretrained ViT and a pretrained BERT model. Note that the cross-attention layers will be randomly initialized
-        >>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained('google/vit-base-patch16-224-in21k', 'bert-base-uncased')
+        >>> model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "google/vit-base-patch16-224-in21k", "bert-base-uncased"
+        ... )
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./vit-bert")
         >>> # load fine-tuned model
@@ -417,8 +420,8 @@ class VisionEncoderDecoderModel(PreTrainedModel):
         >>> from PIL import Image
         >>> import torch
 
-        >>> processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
-        >>> model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')
+        >>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
+        >>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
 
         >>> # load image from the IAM dataset
         >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
diff --git a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py
index e8f4d43a8b..52071b4bef 100644
--- a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py
@@ -61,15 +61,15 @@ class VisionTextDualEncoderConfig(PretrainedConfig):
     >>> model = VisionTextDualEncoderModel(config=config)
 
     >>> # Accessing the model configuration
-    >>> config_vision  = model.config.vision_config
+    >>> config_vision = model.config.vision_config
     >>> config_text = model.config.text_config
 
     >>> # Saving the model, including its configuration
-    >>> model.save_pretrained('my-model')
+    >>> model.save_pretrained("my-model")
 
     >>> # loading model and config from pretrained folder
-    >>> vision_text_config = VisionTextDualEncoderConfig.from_pretrained('vit-bert')
-    >>> model = VisionTextDualEncoderModel.from_pretrained('vit-bert', config=vision_text_config)
+    >>> vision_text_config = VisionTextDualEncoderConfig.from_pretrained("my-model")
+    >>> model = VisionTextDualEncoderModel.from_pretrained("my-model", config=vision_text_config)
     ```"""
 
     model_type = "vision-text-dual-encoder"
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
index c99ebee35c..130ade2e78 100644
--- a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
@@ -446,12 +446,15 @@ class FlaxVisionTextDualEncoderModel(FlaxPreTrainedModel):
 
         ```python
         >>> from transformers import FlaxVisionTextDualEncoderModel
+
         >>> # initialize a model from pretrained ViT and BERT models. Note that the projection layers will be randomly initialized.
-        >>> model =  FlaxVisionTextDualEncoderModel.from_vision_text_pretrained('bert-base-uncased', 'google/vit-base-patch16-224')
+        >>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
+        ...     "google/vit-base-patch16-224", "bert-base-uncased"
+        ... )
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./vit-bert")
         >>> # load fine-tuned model
-        >>> model =  FlaxVisionTextDualEncoderModel.from_pretrained("./vit-bert")
+        >>> model = FlaxVisionTextDualEncoderModel.from_pretrained("./vit-bert")
         ```"""
 
         kwargs_vision = {
@@ -531,19 +534,36 @@ VISION_TEXT_DUAL_ENCODER_MODEL_DOCSTRING = r"""
     >>> from PIL import Image
     >>> import requests
     >>> import jax
-    >>> from transformers import FlaxVisionTextDualEncoderModel, VisionTextDualEncoderProcessor, ViTFeatureExtractor, BertTokenizer
+    >>> from transformers import (
+    ...     FlaxVisionTextDualEncoderModel,
+    ...     VisionTextDualEncoderProcessor,
+    ...     ViTFeatureExtractor,
+    ...     BertTokenizer,
+    ... )
 
     >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
     >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
     >>> processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
-    >>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained("google/vit-base-patch16-224", "bert-base-uncased")
+    >>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained(
+    ...     "google/vit-base-patch16-224", "bert-base-uncased"
+    ... )
 
     >>> # contrastive training
-    >>> urls = ["http://images.cocodataset.org/val2017/000000039769.jpg", "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg]
+    >>> urls = [
+    ...     "http://images.cocodataset.org/val2017/000000039769.jpg",
+    ...     "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg",
+    ... ]
     >>> images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
-    >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="np", padding=True)
-    >>> outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, pixel_values=inputs.pixel_values, return_loss=True)
-    >>> loss, logits_per_image = outputs.loss, outputs.logits_per_imag # this is the image-text similarity score
+    >>> inputs = processor(
+    ...     text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="np", padding=True
+    ... )
+    >>> outputs = model(
+    ...     input_ids=inputs.input_ids,
+    ...     attention_mask=inputs.attention_mask,
+    ...     pixel_values=inputs.pixel_values,
+    ...     return_loss=True,
+    ... )
+    >>> loss, logits_per_image = outputs.loss, outputs.logits_per_image  # this is the image-text similarity score
 
     >>> # save and load from pretrained
     >>> model.save_pretrained("vit-bert")
@@ -551,8 +571,8 @@ VISION_TEXT_DUAL_ENCODER_MODEL_DOCSTRING = r"""
 
     >>> # inference
     >>> outputs = model(**inputs)
-    >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
-    >>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
+    >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+    >>> probs = jax.nn.softmax(logits_per_image, axis=1)  # we can take the softmax to get the label probabilities
     ```
 """
 
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
index 2b2225dc36..f40cb2782f 100755
--- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
@@ -231,7 +231,7 @@ class VisionTextDualEncoderModel(PreTrainedModel):
         >>> model = VisionTextDualEncoderModel.from_pretrained("clip-italian/clip-italian")
         >>> tokenizer = AutoTokenizer.from_pretrained("clip-italian/clip-italian")
 
-        >>> inputs = tokenizer(["una foto di un gatto", "una foto di un cane"],  padding=True, return_tensors="pt")
+        >>> inputs = tokenizer(["una foto di un gatto", "una foto di un cane"], padding=True, return_tensors="pt")
         >>> text_features = model.get_text_features(**inputs)
         ```"""
         text_outputs = self.text_model(
@@ -312,19 +312,36 @@ class VisionTextDualEncoderModel(PreTrainedModel):
         ```python
         >>> from PIL import Image
         >>> import requests
-        >>> from transformers import VisionTextDualEncoderModel, VisionTextDualEncoderProcessor, ViTFeatureExtractor, BertTokenizer
+        >>> from transformers import (
+        ...     VisionTextDualEncoderModel,
+        ...     VisionTextDualEncoderProcessor,
+        ...     ViTFeatureExtractor,
+        ...     BertTokenizer,
+        ... )
 
         >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
         >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
         >>> processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
-        >>> model = VisionTextDualEncoderModel.from_vision_text_pretrained("google/vit-base-patch16-224", "bert-base-uncased")
+        >>> model = VisionTextDualEncoderModel.from_vision_text_pretrained(
+        ...     "google/vit-base-patch16-224", "bert-base-uncased"
+        ... )
 
         >>> # contrastive training
-        >>> urls = ["http://images.cocodataset.org/val2017/000000039769.jpg", "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg]
+        >>> urls = [
+        ...     "http://images.cocodataset.org/val2017/000000039769.jpg",
+        ...     "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg",
+        ... ]
         >>> images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
-        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="pt", padding=True)
-        >>> outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, pixel_values=inputs.pixel_values, return_loss=True)
-        >>> loss, logits_per_image = outputs.loss, outputs.logits_per_imag # this is the image-text similarity score
+        >>> inputs = processor(
+        ...     text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="pt", padding=True
+        ... )
+        >>> outputs = model(
+        ...     input_ids=inputs.input_ids,
+        ...     attention_mask=inputs.attention_mask,
+        ...     pixel_values=inputs.pixel_values,
+        ...     return_loss=True,
+        ... )
+        >>> loss, logits_per_image = outputs.loss, outputs.logits_per_image  # this is the image-text similarity score
 
         >>> # save and load from pretrained
         >>> model.save_pretrained("vit-bert")
@@ -332,8 +349,8 @@ class VisionTextDualEncoderModel(PreTrainedModel):
 
         >>> # inference
         >>> outputs = model(**inputs)
-        >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
-        >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.return_dict
 
@@ -447,8 +464,11 @@ class VisionTextDualEncoderModel(PreTrainedModel):
 
         ```python
         >>> from transformers import VisionTextDualEncoderModel
+
         >>> # initialize a model from pretrained ViT and BERT models. Note that the projection layers will be randomly initialized.
-        >>> model = VisionTextDualEncoderModel.from_vision_text_pretrained('bert-base-uncased', 'google/vit-base-patch16-224')
+        >>> model = VisionTextDualEncoderModel.from_vision_text_pretrained(
+        ...     "google/vit-base-patch16-224", "bert-base-uncased"
+        ... )
         >>> # saving model after fine-tuning
         >>> model.save_pretrained("./vit-bert")
         >>> # load fine-tuned model
diff --git a/src/transformers/models/visual_bert/configuration_visual_bert.py b/src/transformers/models/visual_bert/configuration_visual_bert.py
index b8aafa2c4d..d2365c80bc 100644
--- a/src/transformers/models/visual_bert/configuration_visual_bert.py
+++ b/src/transformers/models/visual_bert/configuration_visual_bert.py
@@ -93,7 +93,7 @@ class VisualBertConfig(PretrainedConfig):
     >>> from transformers import VisualBertModel, VisualBertConfig
 
     >>> # Initializing a VisualBERT visualbert-vqa-coco-pre style configuration
-    >>> configuration = VisualBertConfig.from_pretrained('visualbert-vqa-coco-pre')
+    >>> configuration = VisualBertConfig.from_pretrained("visualbert-vqa-coco-pre")
 
     >>> # Initializing a model from the visualbert-vqa-coco-pre style configuration
     >>> model = VisualBertModel(configuration)
diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py
index cb388c60a1..919a46d7b6 100755
--- a/src/transformers/models/visual_bert/modeling_visual_bert.py
+++ b/src/transformers/models/visual_bert/modeling_visual_bert.py
@@ -745,19 +745,21 @@ class VisualBertModel(VisualBertPreTrainedModel):
         from transformers import BertTokenizer, VisualBertModel
         import torch
 
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = VisualBertModel.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
 
         inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
         visual_embeds = get_visual_embeddings(image).unsqueeze(0)
         visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
         visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
 
-        inputs.update({
-            "visual_embeds": visual_embeds,
-            "visual_token_type_ids": visual_token_type_ids,
-            "visual_attention_mask": visual_attention_mask
-        })
+        inputs.update(
+            {
+                "visual_embeds": visual_embeds,
+                "visual_token_type_ids": visual_token_type_ids,
+                "visual_attention_mask": visual_attention_mask,
+            }
+        )
 
         outputs = model(**inputs)
 
@@ -927,22 +929,26 @@ class VisualBertForPreTraining(VisualBertPreTrainedModel):
         # Assumption: *get_visual_embeddings(image)* gets the visual embeddings of the image in the batch.
         from transformers import BertTokenizer, VisualBertForPreTraining
 
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = VisualBertForPreTraining.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        model = VisualBertForPreTraining.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
 
         inputs = tokenizer("The capital of France is {mask}.", return_tensors="pt")
         visual_embeds = get_visual_embeddings(image).unsqueeze(0)
         visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
         visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
 
-        inputs.update({
-        "visual_embeds": visual_embeds,
-        "visual_token_type_ids": visual_token_type_ids,
-        "visual_attention_mask": visual_attention_mask
-        })
-        max_length  = inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]
-        labels = tokenizer("The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length)["input_ids"]
-        sentence_image_labels = torch.tensor(1).unsqueeze(0) # Batch_size
+        inputs.update(
+            {
+                "visual_embeds": visual_embeds,
+                "visual_token_type_ids": visual_token_type_ids,
+                "visual_attention_mask": visual_attention_mask,
+            }
+        )
+        max_length = inputs["input_ids"].shape[-1] + visual_embeds.shape[-2]
+        labels = tokenizer(
+            "The capital of France is Paris.", return_tensors="pt", padding="max_length", max_length=max_length
+        )["input_ids"]
+        sentence_image_labels = torch.tensor(1).unsqueeze(0)  # Batch_size
 
 
         outputs = model(**inputs, labels=labels, sentence_image_labels=sentence_image_labels)
@@ -1063,8 +1069,8 @@ class VisualBertForMultipleChoice(VisualBertPreTrainedModel):
         from transformers import BertTokenizer, VisualBertForMultipleChoice
         import torch
 
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = VisualBertForMultipleChoice.from_pretrained('uclanlp/visualbert-vcr')
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        model = VisualBertForMultipleChoice.from_pretrained("uclanlp/visualbert-vcr")
 
         prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
         choice0 = "It is eaten with a fork and a knife."
@@ -1078,15 +1084,17 @@ class VisualBertForMultipleChoice(VisualBertPreTrainedModel):
 
         labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1
 
-        encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
+        encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors="pt", padding=True)
         # batch size is 1
-        inputs_dict = {k: v.unsqueeze(0) for k,v in encoding.items()}
-        inputs_dict.update({
-        "visual_embeds": visual_embeds,
-        "visual_attention_mask": visual_attention_mask,
-        "visual_token_type_ids": visual_token_type_ids,
-        "labels": labels
-        })
+        inputs_dict = {k: v.unsqueeze(0) for k, v in encoding.items()}
+        inputs_dict.update(
+            {
+                "visual_embeds": visual_embeds,
+                "visual_attention_mask": visual_attention_mask,
+                "visual_token_type_ids": visual_token_type_ids,
+                "labels": labels,
+            }
+        )
         outputs = model(**inputs_dict)
 
         loss = outputs.loss
@@ -1212,22 +1220,24 @@ class VisualBertForQuestionAnswering(VisualBertPreTrainedModel):
         from transformers import BertTokenizer, VisualBertForQuestionAnswering
         import torch
 
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = VisualBertForQuestionAnswering.from_pretrained('uclanlp/visualbert-vqa')
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        model = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")
 
         text = "Who is eating the apple?"
-        inputs = tokenizer(text, return_tensors='pt')
+        inputs = tokenizer(text, return_tensors="pt")
         visual_embeds = get_visual_embeddings(image).unsqueeze(0)
         visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
         visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
 
-        inputs.update({
-        "visual_embeds": visual_embeds,
-        "visual_token_type_ids": visual_token_type_ids,
-        "visual_attention_mask": visual_attention_mask
-        })
+        inputs.update(
+            {
+                "visual_embeds": visual_embeds,
+                "visual_token_type_ids": visual_token_type_ids,
+                "visual_attention_mask": visual_attention_mask,
+            }
+        )
 
-        labels = torch.tensor([[0.0,1.0]]).unsqueeze(0)  # Batch size 1, Num labels 2
+        labels = torch.tensor([[0.0, 1.0]]).unsqueeze(0)  # Batch size 1, Num labels 2
 
         outputs = model(**inputs, labels=labels)
         loss = outputs.loss
@@ -1336,20 +1346,22 @@ class VisualBertForVisualReasoning(VisualBertPreTrainedModel):
         from transformers import BertTokenizer, VisualBertForVisualReasoning
         import torch
 
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = VisualBertForVisualReasoning.from_pretrained('uclanlp/visualbert-nlvr2')
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        model = VisualBertForVisualReasoning.from_pretrained("uclanlp/visualbert-nlvr2")
 
         text = "Who is eating the apple?"
-        inputs = tokenizer(text, return_tensors='pt')
+        inputs = tokenizer(text, return_tensors="pt")
         visual_embeds = get_visual_embeddings(image).unsqueeze(0)
         visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
         visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
 
-        inputs.update({
-        "visual_embeds": visual_embeds,
-        "visual_token_type_ids": visual_token_type_ids,
-        "visual_attention_mask": visual_attention_mask
-        })
+        inputs.update(
+            {
+                "visual_embeds": visual_embeds,
+                "visual_token_type_ids": visual_token_type_ids,
+                "visual_attention_mask": visual_attention_mask,
+            }
+        )
 
         labels = torch.tensor(1).unsqueeze(0)  # Batch size 1, Num choices 2
 
@@ -1498,24 +1510,28 @@ class VisualBertForRegionToPhraseAlignment(VisualBertPreTrainedModel):
         from transformers import BertTokenizer, VisualBertForRegionToPhraseAlignment
         import torch
 
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = VisualBertForRegionToPhraseAlignment.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        model = VisualBertForRegionToPhraseAlignment.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
 
         text = "Who is eating the apple?"
-        inputs = tokenizer(text, return_tensors='pt')
+        inputs = tokenizer(text, return_tensors="pt")
         visual_embeds = get_visual_embeddings(image).unsqueeze(0)
         visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
         visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
-        region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2]))
+        region_to_phrase_position = torch.ones((1, inputs["input_ids"].shape[-1] + visual_embeds.shape[-2]))
 
-        inputs.update({
-        "region_to_phrase_position": region_to_phrase_position,
-        "visual_embeds": visual_embeds,
-        "visual_token_type_ids": visual_token_type_ids,
-        "visual_attention_mask": visual_attention_mask
-        })
+        inputs.update(
+            {
+                "region_to_phrase_position": region_to_phrase_position,
+                "visual_embeds": visual_embeds,
+                "visual_token_type_ids": visual_token_type_ids,
+                "visual_attention_mask": visual_attention_mask,
+            }
+        )
 
-        labels = torch.ones((1, inputs["input_ids"].shape[-1]+visual_embeds.shape[-2], visual_embeds.shape[-2])) # Batch size 1
+        labels = torch.ones(
+            (1, inputs["input_ids"].shape[-1] + visual_embeds.shape[-2], visual_embeds.shape[-2])
+        )  # Batch size 1
 
         outputs = model(**inputs, labels=labels)
         loss = outputs.loss
diff --git a/src/transformers/models/vit/modeling_flax_vit.py b/src/transformers/models/vit/modeling_flax_vit.py
index ace49edac6..aad3141dda 100644
--- a/src/transformers/models/vit/modeling_flax_vit.py
+++ b/src/transformers/models/vit/modeling_flax_vit.py
@@ -521,11 +521,11 @@ FLAX_VISION_MODEL_DOCSTRING = """
     >>> from PIL import Image
     >>> import requests
 
-    >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
     >>> image = Image.open(requests.get(url, stream=True).raw)
 
-    >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
-    >>> model = FlaxViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
+    >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+    >>> model = FlaxViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
 
     >>> inputs = feature_extractor(images=image, return_tensors="np")
     >>> outputs = model(**inputs)
@@ -603,11 +603,11 @@ FLAX_VISION_CLASSIF_DOCSTRING = """
     >>> import jax
     >>> import requests
 
-    >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
     >>> image = Image.open(requests.get(url, stream=True).raw)
 
-    >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
-    >>> model = FlaxViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
+    >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
+    >>> model = FlaxViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
 
     >>> inputs = feature_extractor(images=image, return_tensors="np")
     >>> outputs = model(**inputs)
diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py
index 24cfd4d3a4..9825260986 100644
--- a/src/transformers/models/vit/modeling_tf_vit.py
+++ b/src/transformers/models/vit/modeling_tf_vit.py
@@ -682,11 +682,11 @@ class TFViTModel(TFViTPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
-        >>> model = TFViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
+        >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+        >>> model = TFViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
 
         >>> inputs = feature_extractor(images=image, return_tensors="tf")
         >>> outputs = model(**inputs)
@@ -803,11 +803,11 @@ class TFViTForImageClassification(TFViTPreTrainedModel, TFSequenceClassification
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
-        >>> model = TFViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
+        >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
+        >>> model = TFViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
 
         >>> inputs = feature_extractor(images=image, return_tensors="tf")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py
index f8569b9645..27530ef8a0 100644
--- a/src/transformers/models/vit/modeling_vit.py
+++ b/src/transformers/models/vit/modeling_vit.py
@@ -522,11 +522,11 @@ class ViTModel(ViTPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
-        >>> model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
+        >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224-in21k")
+        >>> model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
@@ -634,11 +634,11 @@ class ViTForImageClassification(ViTPreTrainedModel):
         >>> from PIL import Image
         >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
-        >>> model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
+        >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
+        >>> model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")
 
         >>> inputs = feature_extractor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
index 5764fed75d..73d9a3ad25 100644
--- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
@@ -952,15 +952,19 @@ FLAX_WAV2VEC2_MODEL_DOCSTRING = """
     >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")
     >>> model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-large-lv60")
 
+
     >>> def map_to_array(batch):
-    >>>     speech, _ = sf.read(batch["file"])
-    >>>     batch["speech"] = speech
-    >>>     return batch
+    ...     speech, _ = sf.read(batch["file"])
+    ...     batch["speech"] = speech
+    ...     return batch
+
 
     >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
     >>> ds = ds.map(map_to_array)
 
-    >>> input_values = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="np").input_values  # Batch size 1
+    >>> input_values = processor(
+    ...     ds["speech"][0], sampling_rate=16_000, return_tensors="np"
+    ... ).input_values  # Batch size 1
     >>> hidden_states = model(input_values).last_hidden_state
     ```
 """
@@ -1055,15 +1059,19 @@ FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """
     >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60")
     >>> model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60")
 
+
     >>> def map_to_array(batch):
-    >>>     speech, _ = sf.read(batch["file"])
-    >>>     batch["speech"] = speech
-    >>>     return batch
+    ...     speech, _ = sf.read(batch["file"])
+    ...     batch["speech"] = speech
+    ...     return batch
+
 
     >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
     >>> ds = ds.map(map_to_array)
 
-    >>> input_values = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="np").input_values  # Batch size 1
+    >>> input_values = processor(
+    ...     ds["speech"][0], sampling_rate=16_000, return_tensors="np"
+    ... ).input_values  # Batch size 1
     >>> logits = model(input_values).logits
     >>> predicted_ids = jnp.argmax(logits, axis=-1)
 
@@ -1264,9 +1272,7 @@ FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """
     >>> outputs = model(input_values, mask_time_indices=mask_time_indices)
 
     >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
-    >>> cosine_sim = optax.cosine_similarity(
-    ...     outputs.projected_states, outputs.projected_quantized_states
-    ... )
+    >>> cosine_sim = optax.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states)
 
     >>> # show that cosine similarity is much higher than random
     >>> assert np.asarray(cosine_sim)[mask_time_indices].mean() > 0.5
diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
index eeccb467a4..ac7658b0c3 100644
--- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
@@ -1408,10 +1408,12 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
         >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
         >>> model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
 
+
         >>> def map_to_array(batch):
-        >>>     speech, _ = sf.read(batch["file"])
-        >>>     batch["speech"] = speech
-        >>>     return batch
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...     return batch
+
 
         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> ds = ds.map(map_to_array)
@@ -1519,15 +1521,17 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
         >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
         >>> model = TFWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
 
+
         >>> def map_to_array(batch):
-        >>>     speech, _ = sf.read(batch["file"])
-        >>>     batch["speech"] = speech
-        >>>     return batch
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...     return batch
+
 
         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> ds = ds.map(map_to_array)
 
-        >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values # Batch size 1
+        >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
         >>> logits = model(input_values).logits
         >>> predicted_ids = tf.argmax(logits, axis=-1)
 
@@ -1538,7 +1542,7 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
 
         >>> # wrap processor as target processor to encode labels
         >>> with processor.as_target_processor():
-        >>>     labels = processor(transcription, return_tensors="tf").input_ids
+        ...     labels = processor(transcription, return_tensors="tf").input_ids
 
         >>> loss = model(input_values, labels=labels).loss
         ```"""
diff --git a/src/transformers/models/wav2vec2/modeling_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
index 9d28d4980a..2fd354eba7 100755
--- a/src/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -1421,9 +1421,7 @@ class Wav2Vec2ForPreTraining(Wav2Vec2PreTrainedModel):
         ...     outputs = model(input_values, mask_time_indices=mask_time_indices)
 
         >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
-        >>> cosine_sim = torch.cosine_similarity(
-        ...     outputs.projected_states, outputs.projected_quantized_states, dim=-1
-        ... )
+        >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)
 
         >>> # show that cosine similarity is much higher than random
         >>> assert cosine_sim[mask_time_indices].mean() > 0.5
@@ -1568,10 +1566,12 @@ class Wav2Vec2ForMaskedLM(Wav2Vec2PreTrainedModel):
         >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
         >>> model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-base-960h")
 
+
         >>> def map_to_array(batch):
-        >>>     speech, _ = sf.read(batch["file"])
-        >>>     batch["speech"] = speech
-        >>>     return batch
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...     return batch
+
 
         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
         >>> ds = ds.map(map_to_array)
diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
index ed9b32b0b9..4b22fbdda1 100644
--- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
@@ -298,11 +298,11 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
 
         ```python
         # Let's see how to increase the vocabulary of Bert model and tokenizer
-        tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('facebook/wav2vec2-base-960h')
-        model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base-960h')
+        tokenizer = Wav2Vec2CTCTokenizer.from_pretrained("facebook/wav2vec2-base-960h")
+        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
 
-        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
-        print('We have added', num_added_toks, 'tokens')
+        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
+        print("We have added", num_added_toks, "tokens")
         # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
         model.resize_token_embeddings(len(tokenizer))
         ```"""
diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
index d1de690e4a..73811f167c 100644
--- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
+++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
@@ -370,11 +370,11 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
 
         ```python
         # Let's see how to increase the vocabulary of Bert model and tokenizer
-        tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained('facebook/wav2vec2-lv-60-espeak-cv-ft')
-        model = Wav2Vec2PhonemeForCTC.from_pretrained('facebook/wav2vec2-lv-60-espeak-cv-ft')
+        tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
+        model = Wav2Vec2PhonemeForCTC.from_pretrained("facebook/wav2vec2-lv-60-espeak-cv-ft")
 
-        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
-        print('We have added', num_added_toks, 'tokens')
+        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
+        print("We have added", num_added_toks, "tokens")
         # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
         model.resize_token_embeddings(len(tokenizer))
         ```"""
diff --git a/src/transformers/models/xlm/modeling_xlm.py b/src/transformers/models/xlm/modeling_xlm.py
index 77e34b7926..5ab432015f 100755
--- a/src/transformers/models/xlm/modeling_xlm.py
+++ b/src/transformers/models/xlm/modeling_xlm.py
@@ -1042,10 +1042,12 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
         >>> from transformers import XLMTokenizer, XLMForQuestionAnswering
         >>> import torch
 
-        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
+        >>> tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
+        >>> model = XLMForQuestionAnswering.from_pretrained("xlm-mlm-en-2048")
 
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
+        ...     0
+        ... )  # Batch size 1
         >>> start_positions = torch.tensor([1])
         >>> end_positions = torch.tensor([3])
 
diff --git a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py
index c272de5d64..dfb7b39491 100644
--- a/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py
+++ b/src/transformers/models/xlm_prophetnet/modeling_xlm_prophetnet.py
@@ -46,8 +46,8 @@ class XLMProphetNetEncoder(ProphetNetEncoder):
     >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetEncoder
     >>> import torch
 
-    >>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
-    >>> model = XLMProphetNetEncoder.from_pretrained('patrickvonplaten/xprophetnet-large-uncased-standalone')
+    >>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
+    >>> model = XLMProphetNetEncoder.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
     >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
     >>> outputs = model(**inputs)
@@ -69,8 +69,10 @@ class XLMProphetNetDecoder(ProphetNetDecoder):
     >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetDecoder
     >>> import torch
 
-    >>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
-    >>> model = XLMProphetNetDecoder.from_pretrained('patrickvonplaten/xprophetnet-large-uncased-standalone', add_cross_attention=False)
+    >>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
+    >>> model = XLMProphetNetDecoder.from_pretrained(
+    ...     "patrickvonplaten/xprophetnet-large-uncased-standalone", add_cross_attention=False
+    ... )
     >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
     >>> outputs = model(**inputs)
@@ -91,10 +93,12 @@ class XLMProphetNetModel(ProphetNetModel):
     ```python
     >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetModel
 
-    >>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
-    >>> model = XLMProphetNetModel.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
+    >>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
+    >>> model = XLMProphetNetModel.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
 
-    >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+    >>> input_ids = tokenizer(
+    ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+    ... ).input_ids  # Batch size 1
     >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
     >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
 
@@ -115,10 +119,12 @@ class XLMProphetNetForConditionalGeneration(ProphetNetForConditionalGeneration):
     ```python
     >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetForConditionalGeneration
 
-    >>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
-    >>> model =  XLMProphetNetForConditionalGeneration.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
+    >>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
+    >>> model = XLMProphetNetForConditionalGeneration.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
 
-    >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+    >>> input_ids = tokenizer(
+    ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+    ... ).input_ids  # Batch size 1
     >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
     >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
 
@@ -140,8 +146,8 @@ class XLMProphetNetForCausalLM(ProphetNetForCausalLM):
     >>> from transformers import XLMProphetNetTokenizer, XLMProphetNetForCausalLM
     >>> import torch
 
-    >>> tokenizer = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
-    >>> model = XLMProphetNetForCausalLM.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
+    >>> tokenizer = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
+    >>> model = XLMProphetNetForCausalLM.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
     >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
     >>> outputs = model(**inputs)
@@ -152,14 +158,16 @@ class XLMProphetNetForCausalLM(ProphetNetForCausalLM):
     >>> from transformers import EncoderDecoderModel, XLMProphetNetTokenizer, XLMRobertaTokenizer
     >>> import torch
 
-    >>> tokenizer_enc = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
-    >>> tokenizer_dec = XLMProphetNetTokenizer.from_pretrained('microsoft/xprophetnet-large-wiki100-cased')
-    >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("xlm-roberta-large", 'microsoft/xprophetnet-large-wiki100-cased')
+    >>> tokenizer_enc = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")
+    >>> tokenizer_dec = XLMProphetNetTokenizer.from_pretrained("microsoft/xprophetnet-large-wiki100-cased")
+    >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
+    ...     "xlm-roberta-large", "microsoft/xprophetnet-large-wiki100-cased"
+    ... )
 
     >>> ARTICLE = (
-    ... "the us state department said wednesday it had received no "
-    ... "formal word from bolivia that it was expelling the us ambassador there "
-    ... "but said the charges made against him are `` baseless ."
+    ...     "the us state department said wednesday it had received no "
+    ...     "formal word from bolivia that it was expelling the us ambassador there "
+    ...     "but said the charges made against him are `` baseless ."
     ... )
     >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
     >>> labels = tokenizer_dec("us rejects charges against its ambassador in bolivia", return_tensors="pt").input_ids
diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py
index f8ab393a93..8a7b34197e 100644
--- a/src/transformers/models/xlnet/modeling_tf_xlnet.py
+++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py
@@ -1321,21 +1321,33 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel, TFCausalLanguageModelingLoss):
         >>> import numpy as np
         >>> from transformers import XLNetTokenizer, TFXLNetLMHeadModel
 
-        >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        >>> model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased')
+        >>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
+        >>> model = TFXLNetLMHeadModel.from_pretrained("xlnet-large-cased")
 
         >>> # We show how to setup inputs to predict a next token using a bi-directional context.
-        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[None, :]  # We will predict the masked token
+        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=True))[
+        ...     None, :
+        ... ]  # We will predict the masked token
 
         >>> perm_mask = np.zeros((1, input_ids.shape[1], input_ids.shape[1]))
         >>> perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
 
-        >>> target_mapping = np.zeros((1, 1, input_ids.shape[1]))  # Shape [1, 1, seq_length] => let's predict one token
-        >>> target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
+        >>> target_mapping = np.zeros(
+        ...     (1, 1, input_ids.shape[1])
+        ... )  # Shape [1, 1, seq_length] => let's predict one token
+        >>> target_mapping[
+        ...     0, 0, -1
+        ... ] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
 
-        >>> outputs = model(input_ids, perm_mask=tf.constant(perm_mask, dtype=tf.float32), target_mapping=tf.constant(target_mapping, dtype=tf.float32))
+        >>> outputs = model(
+        ...     input_ids,
+        ...     perm_mask=tf.constant(perm_mask, dtype=tf.float32),
+        ...     target_mapping=tf.constant(target_mapping, dtype=tf.float32),
+        ... )
 
-        >>> next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
+        >>> next_token_logits = outputs[
+        ...     0
+        ... ]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
         ```"""
         inputs = input_processing(
             func=self.call,
diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py
index 957ba0b5d4..278320a6b4 100755
--- a/src/transformers/models/xlnet/modeling_xlnet.py
+++ b/src/transformers/models/xlnet/modeling_xlnet.py
@@ -1400,31 +1400,53 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         >>> from transformers import XLNetTokenizer, XLNetLMHeadModel
         >>> import torch
 
-        >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        >>> model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
+        >>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-large-cased")
+        >>> model = XLNetLMHeadModel.from_pretrained("xlnet-large-cased")
 
         >>> # We show how to setup inputs to predict a next token using a bi-directional context.
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0)  # We will predict the masked token
+        >>> input_ids = torch.tensor(
+        ...     tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)
+        >>> ).unsqueeze(
+        ...     0
+        >>> )  # We will predict the masked token
         >>> perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
         >>> perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-        >>> target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
-        >>> target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
+        >>> target_mapping = torch.zeros(
+        ...     (1, 1, input_ids.shape[1]), dtype=torch.float
+        >>> )  # Shape [1, 1, seq_length] => let's predict one token
+        >>> target_mapping[
+        ...     0, 0, -1
+        >>> ] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
 
         >>> outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
-        >>> next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
+        >>> next_token_logits = outputs[
+        ...     0
+        >>> ]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
 
         >>> # The same way can the XLNetLMHeadModel be used to be trained by standard auto-regressive language modeling.
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)).unsqueeze(0)  # We will predict the masked token
+        >>> input_ids = torch.tensor(
+        ...     tokenizer.encode("Hello, my dog is very <mask>", add_special_tokens=False)
+        >>> ).unsqueeze(
+        ...     0
+        >>> )  # We will predict the masked token
         >>> labels = torch.tensor(tokenizer.encode("cute", add_special_tokens=False)).unsqueeze(0)
-        >>> assert labels.shape[0] == 1, 'only one word will be predicted'
+        >>> assert labels.shape[0] == 1, "only one word will be predicted"
         >>> perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
-        >>> perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token as is done in standard auto-regressive lm training
-        >>> target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
-        >>> target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
+        >>> perm_mask[
+        ...     :, :, -1
+        >>> ] = 1.0  # Previous tokens don't see last token as is done in standard auto-regressive lm training
+        >>> target_mapping = torch.zeros(
+        ...     (1, 1, input_ids.shape[1]), dtype=torch.float
+        >>> )  # Shape [1, 1, seq_length] => let's predict one token
+        >>> target_mapping[
+        ...     0, 0, -1
+        >>> ] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
 
         >>> outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping, labels=labels)
         >>> loss = outputs.loss
-        >>> next_token_logits = outputs.logits  # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
+        >>> next_token_logits = (
+        ...     outputs.logits
+        >>> )  # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
         ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -1968,10 +1990,12 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
         >>> from transformers import XLNetTokenizer, XLNetForQuestionAnswering
         >>> import torch
 
-        >>> tokenizer =  XLNetTokenizer.from_pretrained('xlnet-base-cased')
-        >>> model = XLNetForQuestionAnswering.from_pretrained('xlnet-base-cased')
+        >>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
+        >>> model = XLNetForQuestionAnswering.from_pretrained("xlnet-base-cased")
 
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(
+        ...     0
+        >>> )  # Batch size 1
         >>> start_positions = torch.tensor([1])
         >>> end_positions = torch.tensor([3])
         >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index 50e7d3cd00..124ce7f086 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -433,6 +433,7 @@ class Adafactor(Optimizer):
 
     ```python
     from transformers.optimization import Adafactor, AdafactorSchedule
+
     optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
     lr_scheduler = AdafactorSchedule(optimizer)
     trainer = Trainer(..., optimizers=(optimizer, lr_scheduler))
@@ -452,7 +453,7 @@ class Adafactor(Optimizer):
         weight_decay=0.0,
         relative_step=False,
         scale_parameter=False,
-        warmup_init=False
+        warmup_init=False,
     )
     ```"""
 
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index 229963175f..e65068e534 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -469,15 +469,15 @@ def pipeline(
     >>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
 
     >>> # Sentiment analysis pipeline
-    >>> pipeline('sentiment-analysis')
+    >>> pipeline("sentiment-analysis")
 
     >>> # Question answering pipeline, specifying the checkpoint identifier
-    >>> pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')
+    >>> pipeline("question-answering", model="distilbert-base-cased-distilled-squad", tokenizer="bert-base-cased")
 
     >>> # Named entity recognition pipeline, passing in a specific model and tokenizer
     >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
     >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-    >>> pipeline('ner', model=model, tokenizer=tokenizer)
+    >>> pipeline("ner", model=model, tokenizer=tokenizer)
     ```"""
     if model_kwargs is None:
         model_kwargs = {}
diff --git a/src/transformers/pipelines/table_question_answering.py b/src/transformers/pipelines/table_question_answering.py
index e50c492702..f37eb16603 100644
--- a/src/transformers/pipelines/table_question_answering.py
+++ b/src/transformers/pipelines/table_question_answering.py
@@ -259,10 +259,10 @@ class TableQuestionAnsweringPipeline(Pipeline):
 
         ```python
         data = {
-        "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
-        "age": ["56", "45", "59"],
-        "number of movies": ["87", "53", "69"],
-        "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
+            "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
+            "age": ["56", "45", "59"],
+            "number of movies": ["87", "53", "69"],
+            "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
         }
         ```
 
@@ -272,6 +272,7 @@ class TableQuestionAnsweringPipeline(Pipeline):
 
         ```python
         import pandas as pd
+
         table = pd.DataFrame.from_dict(data)
         ```
 
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index fb912012c4..60521acbfc 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -709,6 +709,7 @@ class CaptureStd:
 
     # to capture stderr only with auto-replay
     import sys
+
     with CaptureStderr() as cs:
         print("Warning: ", file=sys.stderr)
     assert "Warning" in cs.err
@@ -826,7 +827,7 @@ class CaptureLogger:
     >>> logger = logging.get_logger("transformers.models.bart.tokenization_bart")
     >>> with CaptureLogger(logger) as cl:
     ...     logger.info(msg)
-    >>> assert cl.out, msg+"\n"
+    >>> assert cl.out, msg + "\n"
     ```
     """
 
@@ -858,7 +859,7 @@ def LoggingLevel(level):
 
     ```python
     with LoggingLevel(logging.INFO):
-        AutoModel.from_pretrained("gpt2") # calls logger.info() several times
+        AutoModel.from_pretrained("gpt2")  # calls logger.info() several times
     ```
     """
     orig_level = transformers_logging.get_verbosity()
@@ -878,8 +879,8 @@ def ExtendSysPath(path: Union[str, os.PathLike]) -> Iterator[None]:
     Usage :
 
     ```python
-    with ExtendSysPath('/path/to/dir'):
-        mymodule = importlib.import_module('mymodule')
+    with ExtendSysPath("/path/to/dir"):
+        mymodule = importlib.import_module("mymodule")
     ```
     """
 
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index 90d6a1d2f3..8ce3396bdd 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -73,6 +73,7 @@ class Trie:
         >>> trie.add("Hello 友達")
         >>> trie.data
         {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}
+
         >>> trie.add("Hello")
         >>> trie.data
         {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
@@ -100,6 +101,7 @@ class Trie:
         >>> trie = Trie()
         >>> trie.split("[CLS] This is a extra_id_100")
         ["[CLS] This is a extra_id_100"]
+
         >>> trie.add("[CLS]")
         >>> trie.add("extra_id_1")
         >>> trie.add("extra_id_100")
@@ -393,11 +395,11 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
 
         ```python
         # Let's see how to increase the vocabulary of Bert model and tokenizer
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertModel.from_pretrained('bert-base-uncased')
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        model = BertModel.from_pretrained("bert-base-uncased")
 
-        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
-        print('We have added', num_added_toks, 'tokens')
+        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
+        print("We have added", num_added_toks, "tokens")
         # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
         model.resize_token_embeddings(len(tokenizer))
         ```"""
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 0365f69cc7..985bbf3e95 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -862,17 +862,17 @@ class SpecialTokensMixin:
 
         ```python
         # Let's see how to add a new classification token to GPT-2
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2Model.from_pretrained('gpt2')
+        tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+        model = GPT2Model.from_pretrained("gpt2")
 
-        special_tokens_dict = {'cls_token': '<CLS>'}
+        special_tokens_dict = {"cls_token": "<CLS>"}
 
         num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
-        print('We have added', num_added_toks, 'tokens')
+        print("We have added", num_added_toks, "tokens")
         # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
         model.resize_token_embeddings(len(tokenizer))
 
-        assert tokenizer.cls_token == '<CLS>'
+        assert tokenizer.cls_token == "<CLS>"
         ```"""
         if not special_tokens_dict:
             return 0
@@ -929,11 +929,11 @@ class SpecialTokensMixin:
 
         ```python
         # Let's see how to increase the vocabulary of Bert model and tokenizer
-        tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
-        model = BertModel.from_pretrained('bert-base-uncased')
+        tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
+        model = BertModel.from_pretrained("bert-base-uncased")
 
-        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
-        print('We have added', num_added_toks, 'tokens')
+        num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"])
+        print("We have added", num_added_toks, "tokens")
         # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
         model.resize_token_embeddings(len(tokenizer))
         ```"""
@@ -1585,22 +1585,22 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         ```python
         # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer
         # Download vocabulary from huggingface.co and cache.
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
 
         # Download vocabulary from huggingface.co (user-uploaded) and cache.
-        tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
+        tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-cased")
 
         # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*)
-        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
+        tokenizer = BertTokenizer.from_pretrained("./test/saved_model/")
 
         # If the tokenizer uses a single vocabulary file, you can point directly to this file
-        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')
+        tokenizer = BertTokenizer.from_pretrained("./test/saved_model/my_vocab.txt")
 
         # You can link tokens to special vocabulary when instantiating
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", unk_token="<unk>")
         # You should be sure '<unk>' is in the vocabulary when doing that.
         # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead)
-        assert tokenizer.unk_token == '<unk>'
+        assert tokenizer.unk_token == "<unk>"
         ```"""
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py
index e2afa7e939..e81d0a27f2 100644
--- a/src/transformers/trainer_callback.py
+++ b/src/transformers/trainer_callback.py
@@ -201,7 +201,6 @@ class TrainerCallback:
 
     ```python
     class PrinterCallback(TrainerCallback):
-
         def on_log(self, args, state, control, logs=None, **kwargs):
             _ = logs.pop("total_flos", None)
             if state.is_local_process_zero:
diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py
index 74757feb6e..58efc2e4b1 100644
--- a/src/transformers/trainer_pt_utils.py
+++ b/src/transformers/trainer_pt_utils.py
@@ -870,7 +870,7 @@ def log_metrics(self, split, metrics):
 
     Now when this method is run, you will see a report that will include: :
 
-    ```python
+    ```
     init_mem_cpu_alloc_delta   =     1301MB
     init_mem_cpu_peaked_delta  =      154MB
     init_mem_gpu_alloc_delta   =      230MB
diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py
index e822457676..135ebf0a48 100644
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -300,7 +300,7 @@ class TrainerMemoryTracker:
     ```python
     self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics)
     self._memory_tracker.start()
-    code ...
+    # code ...
     metrics = {"train_runtime": 10.5}
     self._memory_tracker.stop_and_update_metrics(metrics)
     ```
diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py
index 25a2138589..23a2eb4c1f 100644
--- a/src/transformers/utils/fx.py
+++ b/src/transformers/utils/fx.py
@@ -526,6 +526,7 @@ def symbolic_trace(
 
     ```python
     from transformers.utils.fx import symbolic_trace
+
     traced_model = symbolic_trace(
         model,
         input_names=["input_ids", "attention_mask", "token_type_ids"],
diff --git a/utils/style_doc.py b/utils/style_doc.py
index 73703d1d16..4f44929bd4 100644
--- a/utils/style_doc.py
+++ b/utils/style_doc.py
@@ -19,6 +19,16 @@ import os
 import re
 import warnings
 
+import black
+
+
+BLACK_AVOID_PATTERNS = {
+    "===PT-TF-SPLIT===": "### PT-TF-SPLIT",
+    "{processor_class}": "FakeProcessorClass",
+    "{model_class}": "FakeModelClass",
+    "{object_class}": "FakeObjectClass",
+}
+
 
 # Regexes
 # Re pattern that catches list introduction (with potential indent)
@@ -50,6 +60,136 @@ def find_indent(line):
     return len(search.groups()[0])
 
 
+def parse_code_example(code_lines):
+    """
+    Parses a code example
+
+    Args:
+        code_lines (`List[str]`): The code lines to parse, with the common indentation already removed so that any
+            doctest prompt sits at the very start of its line.
+
+    Returns:
+        (List[`str`], List[`str`]): The list of code samples and the list of outputs.
+    """
+    has_doctest = code_lines[0][:3] in DOCTEST_PROMPTS
+
+    code_samples = []
+    outputs = []
+    in_code = True
+    current_bit = []
+
+    for line in code_lines:
+        if in_code and has_doctest and not is_empty_line(line) and line[:3] not in DOCTEST_PROMPTS:
+            code_sample = "\n".join(current_bit)
+            code_samples.append(code_sample.strip())
+            in_code = False
+            current_bit = []
+        elif not in_code and line[:3] in DOCTEST_PROMPTS:
+            output = "\n".join(current_bit)
+            outputs.append(output.strip())
+            in_code = True
+            current_bit = []
+
+        # Add the line without doctest prompt
+        if line[:3] in DOCTEST_PROMPTS:
+            line = line[4:]
+        current_bit.append(line)
+
+    # Add last sample
+    if in_code:
+        code_sample = "\n".join(current_bit)
+        code_samples.append(code_sample.strip())
+    else:
+        output = "\n".join(current_bit)
+        outputs.append(output.strip())
+
+    return code_samples, outputs
+
+
+def format_code_example(code: str, max_len: int, in_docstring: bool = False):
+    """
+    Format a code example using black. Will take into account the doctest syntax as well as any initial indentation in
+    the code provided.
+
+    Args:
+        code (`str`): The code example to format.
+        max_len (`int`): The maximum length per line.
+        in_docstring (`bool`, *optional*, defaults to `False`): Whether or not the code example is inside a docstring.
+
+    Returns:
+        `Tuple[str, str]`: The formatted code and the black error message (an empty string if black succeeded).
+    """
+    code_lines = code.split("\n")
+
+    # Find initial indent
+    idx = 0
+    while idx < len(code_lines) and is_empty_line(code_lines[idx]):
+        idx += 1
+    if idx >= len(code_lines):
+        return "", ""
+    indent = find_indent(code_lines[idx])
+
+    # Remove the initial indent for now, we will add it back after styling.
+    # Note that l[indent:] works for empty lines
+    code_lines = [l[indent:] for l in code_lines[idx:]]
+    has_doctest = code_lines[0][:3] in DOCTEST_PROMPTS
+
+    code_samples, outputs = parse_code_example(code_lines)
+
+    # Let's blackify the code! We put everything in one big text to go faster.
+    delimiter = "\n\n### New code sample ###\n"
+    full_code = delimiter.join(code_samples)
+    line_length = max_len - indent
+    if has_doctest:
+        line_length -= 4
+
+    for k, v in BLACK_AVOID_PATTERNS.items():
+        full_code = full_code.replace(k, v)
+    try:
+        formatted_code = black.format_str(
+            full_code, mode=black.FileMode([black.TargetVersion.PY37], line_length=line_length)
+        )
+        error = ""
+    except Exception as e:
+        formatted_code = full_code
+        error = f"Code sample:\n{full_code}\n\nError message:\n{e}"
+
+    # Let's get back the formatted code samples
+    for k, v in BLACK_AVOID_PATTERNS.items():
+        formatted_code = formatted_code.replace(v, k)
+    # Triple quotes will mess up docstrings.
+    if in_docstring:
+        formatted_code = formatted_code.replace('"""', "'''")
+
+    code_samples = formatted_code.split(delimiter)
+    # We can have one output less than code samples
+    if len(outputs) == len(code_samples) - 1:
+        outputs.append("")
+
+    formatted_lines = []
+    for code_sample, output in zip(code_samples, outputs):
+        # black may have added some new lines, we remove them
+        code_sample = code_sample.strip()
+        in_triple_quotes = False
+        for line in code_sample.strip().split("\n"):
+            if has_doctest and not is_empty_line(line):
+                prefix = "... " if line.startswith(" ") or line in [")", "]", "}"] or in_triple_quotes else ">>> "
+            else:
+                prefix = ""
+            indent_str = "" if is_empty_line(line) else (" " * indent)
+            formatted_lines.append(indent_str + prefix + line)
+
+            if '"""' in line:
+                in_triple_quotes = not in_triple_quotes
+
+        formatted_lines.extend([" " * indent + line for line in output.split("\n")])
+        if not output.endswith("===PT-TF-SPLIT==="):
+            formatted_lines.append("")
+
+    result = "\n".join(formatted_lines)
+    return result.rstrip(), error
+
+
 def format_text(text, max_len, prefix="", min_indent=None):
     """
     Format a text in the biggest lines possible with the constraint of a maximum length and an indentation.
@@ -110,6 +250,7 @@ def style_docstring(docstring, max_len):
     in_code = False
     param_indent = -1
     prefix = ""
+    black_errors = []
 
     # Special case for docstrings that begin with continuation of Args with no Args block.
     idx = 0
@@ -153,8 +294,10 @@ def style_docstring(docstring, max_len):
                 current_indent = -1
                 code = "\n".join(current_paragraph)
                 if current_code in ["py", "python"]:
-                    new_lines.append(code)
-                    # new_lines.append(format_code_example(code, max_len))
+                    formatted_code, error = format_code_example(code, max_len, in_docstring=True)
+                    new_lines.append(formatted_code)
+                    if len(error) > 0:
+                        black_errors.append(error)
                 else:
                     new_lines.append(code)
                 current_paragraph = None
@@ -210,7 +353,7 @@ def style_docstring(docstring, max_len):
         paragraph = " ".join(current_paragraph)
         new_lines.append(format_text(paragraph, max_len, prefix=prefix, min_indent=current_indent))
 
-    return "\n".join(new_lines)
+    return "\n".join(new_lines), "\n\n".join(black_errors)
 
 
 def style_file_docstrings(code_file, max_len=119, check_only=False):
@@ -234,6 +377,8 @@ def style_file_docstrings(code_file, max_len=119, check_only=False):
         (s if i % 2 == 0 or _re_doc_ignore.search(splits[i - 1]) is not None else style_docstring(s, max_len=max_len))
         for i, s in enumerate(splits)
     ]
+    black_errors = "\n\n".join([s[1] for s in splits if isinstance(s, tuple) and len(s[1]) > 0])
+    splits = [s[0] if isinstance(s, tuple) else s for s in splits]
     clean_code = '\"\"\"'.join(splits)
     # fmt: on
 
@@ -243,7 +388,7 @@ def style_file_docstrings(code_file, max_len=119, check_only=False):
         with open(code_file, "w", encoding="utf-8", newline="\n") as f:
             f.write(clean_code)
 
-    return diff
+    return diff, black_errors
 
 
 def style_mdx_file(mdx_file, max_len=119, check_only=False):
@@ -267,6 +412,8 @@ def style_mdx_file(mdx_file, max_len=119, check_only=False):
     current_language = ""
     in_code = False
     new_lines = []
+    black_errors = []
+
     for line in lines:
         if _re_code.search(line) is not None:
             in_code = not in_code
@@ -276,8 +423,9 @@ def style_mdx_file(mdx_file, max_len=119, check_only=False):
             else:
                 code = "\n".join(current_code)
                 if current_language in ["py", "python"]:
-                    pass
-                    # code = format_code_example(code, max_len)
+                    code, error = format_code_example(code, max_len)
+                    if len(error) > 0:
+                        black_errors.append(error)
                 new_lines.append(code)
 
             new_lines.append(line)
@@ -293,7 +441,7 @@ def style_mdx_file(mdx_file, max_len=119, check_only=False):
         with open(mdx_file, "w", encoding="utf-8", newline="\n") as f:
             f.write(clean_content)
 
-    return diff
+    return diff, "\n\n".join(black_errors)
 
 
 def style_doc_files(*files, max_len=119, check_only=False):
@@ -310,26 +458,49 @@ def style_doc_files(*files, max_len=119, check_only=False):
         List[`str`]: The list of files changed or that should be restyled.
     """
     changed = []
+    black_errors = []
     for file in files:
         # Treat folders
         if os.path.isdir(file):
             files = [os.path.join(file, f) for f in os.listdir(file)]
-            files = [f for f in files if os.path.isdir(f) or f.endswith(".rst") or f.endswith(".py")]
+            files = [f for f in files if os.path.isdir(f) or f.endswith(".mdx") or f.endswith(".py")]
             changed += style_doc_files(*files, max_len=max_len, check_only=check_only)
         # Treat mdx
         elif file.endswith(".mdx"):
-            if style_mdx_file(file, max_len=max_len, check_only=check_only):
-                changed.append(file)
+            try:
+                diff, black_error = style_mdx_file(file, max_len=max_len, check_only=check_only)
+                if diff:
+                    changed.append(file)
+                if len(black_error) > 0:
+                    black_errors.append(
+                        f"There was a problem while formatting an example in {file} with black:\n{black_error}"
+                    )
+            except Exception:
+                print(f"There is a problem in {file}.")
+                raise
         # Treat python files
         elif file.endswith(".py"):
             try:
-                if style_file_docstrings(file, max_len=max_len, check_only=check_only):
+                diff, black_error = style_file_docstrings(file, max_len=max_len, check_only=check_only)
+                if diff:
                     changed.append(file)
+                if len(black_error) > 0:
+                    black_errors.append(
+                        f"There was a problem while formatting an example in {file} with black:\n{black_error}"
+                    )
             except Exception:
                 print(f"There is a problem in {file}.")
                 raise
         else:
             warnings.warn(f"Ignoring {file} because it's not a py or an mdx file or a folder.")
+    if len(black_errors) > 0:
+        black_message = "\n\n".join(black_errors)
+        raise ValueError(
+            "Some code examples can't be interpreted by black, which means they aren't regular python:\n\n"
+            + black_message
+            + "\n\nMake sure to fix the corresponding docstring or doc file, or remove the py/python after ``` if it "
+            + "was not supposed to be a Python code sample."
+        )
     return changed