From 27b3031de2fb8195dec9bc2093e3e70bdb1c4bff Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Tue, 21 Dec 2021 15:06:33 -0500
Subject: [PATCH] Mass conversion of documentation from rst to Markdown
 (#14866)

* Convert docstrings of all configurations and tokenizers

* Processors and fixes

* Last modeling files and fixes to models

* Pipeline modules

* Utils files

* Data submodule

* All the other files

* Style

* Missing examples

* Style again

* Fix copies

* Say bye bye to rst docstrings forever
---
 src/transformers/configuration_utils.py       |  288 ++--
 src/transformers/convert_slow_tokenizer.py    |   14 +-
 src/transformers/data/data_collator.py        |  143 +-
 src/transformers/data/processors/glue.py      |   16 +-
 src/transformers/data/processors/squad.py     |   52 +-
 src/transformers/data/processors/utils.py     |   22 +-
 src/transformers/debug_utils.py               |  113 +-
 src/transformers/deepspeed.py                 |   36 +-
 .../feature_extraction_sequence_utils.py      |   55 +-
 src/transformers/feature_extraction_utils.py  |  149 +-
 src/transformers/generation_beam_search.py    |  108 +-
 .../generation_flax_logits_process.py         |   66 +-
 src/transformers/generation_flax_utils.py     |   93 +-
 src/transformers/generation_logits_process.py |  137 +-
 .../generation_stopping_criteria.py           |   30 +-
 src/transformers/generation_tf_utils.py       |  456 +++---
 src/transformers/generation_utils.py          | 1222 ++++++++---------
 src/transformers/image_utils.py               |   48 +-
 src/transformers/integrations.py              |   76 +-
 src/transformers/keras_callbacks.py           |   32 +-
 src/transformers/modelcard.py                 |   44 +-
 .../modeling_flax_pytorch_utils.py            |    2 +-
 src/transformers/modeling_flax_utils.py       |  271 ++--
 src/transformers/modeling_tf_utils.py         |  372 ++---
 src/transformers/modeling_utils.py            |  591 ++++----
 .../models/albert/configuration_albert.py     |   95 +-
 .../models/albert/modeling_flax_albert.py     |   18 +-
 .../models/albert/modeling_tf_albert.py       |   21 +-
 .../models/albert/tokenization_albert.py      |  103 +-
 .../models/albert/tokenization_albert_fast.py |   64 +-
 src/transformers/models/auto/auto_factory.py  |  357 ++---
 .../models/auto/configuration_auto.py         |  102 +-
 src/transformers/models/auto/dynamic.py       |   67 +-
 .../models/auto/feature_extraction_auto.py    |   81 +-
 .../models/auto/processing_auto.py            |   73 +-
 .../models/auto/tokenization_auto.py          |  158 +--
 .../models/bart/configuration_bart.py         |   90 +-
 .../models/bart/modeling_flax_bart.py         |   67 +-
 .../models/bart/tokenization_bart.py          |    4 +-
 .../models/bart/tokenization_bart_fast.py     |    6 +-
 .../models/barthez/tokenization_barthez.py    |   91 +-
 .../barthez/tokenization_barthez_fast.py      |   60 +-
 .../models/bartpho/tokenization_bartpho.py    |   90 +-
 .../models/beit/configuration_beit.py         |   81 +-
 .../models/beit/feature_extraction_beit.py    |   66 +-
 src/transformers/models/beit/modeling_beit.py |   25 +-
 .../models/beit/modeling_flax_beit.py         |   54 +-
 .../models/bert/configuration_bert.py         |   81 +-
 .../models/bert/modeling_flax_bert.py         |   40 +-
 .../models/bert/modeling_tf_bert.py           |   23 +-
 .../models/bert/tokenization_bert.py          |   94 +-
 .../models/bert/tokenization_bert_fast.py     |   58 +-
 .../configuration_bert_generation.py          |   67 +-
 .../tokenization_bert_generation.py           |   33 +-
 .../tokenization_bert_japanese.py             |   34 +-
 .../models/bertweet/tokenization_bertweet.py  |  101 +-
 .../models/big_bird/configuration_big_bird.py |   67 +-
 .../models/big_bird/modeling_flax_big_bird.py |   18 +-
 .../models/big_bird/tokenization_big_bird.py  |   67 +-
 .../big_bird/tokenization_big_bird_fast.py    |   68 +-
 .../configuration_bigbird_pegasus.py          |   81 +-
 .../blenderbot/configuration_blenderbot.py    |   85 +-
 .../models/blenderbot/modeling_blenderbot.py  |   19 +-
 .../blenderbot/modeling_flax_blenderbot.py    |   67 +-
 .../blenderbot/tokenization_blenderbot.py     |   12 +-
 .../tokenization_blenderbot_fast.py           |   14 +-
 .../configuration_blenderbot_small.py         |   85 +-
 .../modeling_blenderbot_small.py              |   19 +-
 .../modeling_flax_blenderbot_small.py         |   67 +-
 .../tokenization_blenderbot_small.py          |   16 +-
 .../tokenization_blenderbot_small_fast.py     |   10 +-
 .../models/byt5/tokenization_byt5.py          |   50 +-
 .../camembert/configuration_camembert.py      |    2 +-
 .../camembert/tokenization_camembert.py       |   91 +-
 .../camembert/tokenization_camembert_fast.py  |   61 +-
 .../models/canine/configuration_canine.py     |   66 +-
 .../models/canine/tokenization_canine.py      |   42 +-
 .../models/clip/configuration_clip.py         |  138 +-
 .../models/clip/feature_extraction_clip.py    |   74 +-
 src/transformers/models/clip/modeling_clip.py |   74 +-
 .../models/clip/modeling_flax_clip.py         |   72 +-
 .../models/clip/processing_clip.py            |  100 +-
 .../models/clip/tokenization_clip.py          |   49 +-
 .../models/clip/tokenization_clip_fast.py     |   51 +-
 .../models/convbert/configuration_convbert.py |   71 +-
 .../models/convbert/tokenization_convbert.py  |    6 +-
 .../convbert/tokenization_convbert_fast.py    |    6 +-
 .../models/cpm/tokenization_cpm.py            |   53 +-
 .../models/cpm/tokenization_cpm_fast.py       |   53 +-
 .../models/ctrl/configuration_ctrl.py         |   57 +-
 .../models/ctrl/tokenization_ctrl.py          |    8 +-
 .../models/deberta/configuration_deberta.py   |   66 +-
 .../models/deberta/tokenization_deberta.py    |   46 +-
 .../deberta/tokenization_deberta_fast.py      |   42 +-
 .../deberta_v2/configuration_deberta_v2.py    |   62 +-
 .../deberta_v2/tokenization_deberta_v2.py     |   98 +-
 .../models/deit/configuration_deit.py         |   55 +-
 .../models/deit/feature_extraction_deit.py    |   60 +-
 src/transformers/models/deit/modeling_deit.py |   56 +-
 .../models/detr/configuration_detr.py         |  103 +-
 .../models/detr/feature_extraction_detr.py    |  142 +-
 src/transformers/models/detr/modeling_detr.py |   25 +-
 .../distilbert/configuration_distilbert.py    |   66 +-
 .../distilbert/tokenization_distilbert.py     |    4 +-
 .../tokenization_distilbert_fast.py           |    6 +-
 .../models/dpr/configuration_dpr.py           |   54 +-
 src/transformers/models/dpr/modeling_dpr.py   |   86 +-
 .../models/dpr/modeling_tf_dpr.py             |   85 +-
 .../models/dpr/tokenization_dpr.py            |  122 +-
 .../models/dpr/tokenization_dpr_fast.py       |  122 +-
 .../models/electra/configuration_electra.py   |  104 +-
 .../models/electra/modeling_flax_electra.py   |   16 +-
 .../models/electra/modeling_tf_electra.py     |   19 +-
 .../models/electra/tokenization_electra.py    |    4 +-
 .../electra/tokenization_electra_fast.py      |    6 +-
 .../configuration_encoder_decoder.py          |   63 +-
 .../modeling_encoder_decoder.py               |   40 +-
 .../modeling_flax_encoder_decoder.py          |   89 +-
 .../modeling_tf_encoder_decoder.py            |   70 +-
 .../models/flaubert/configuration_flaubert.py |   98 +-
 .../models/flaubert/tokenization_flaubert.py  |    6 +-
 .../models/fnet/configuration_fnet.py         |   69 +-
 .../models/fnet/tokenization_fnet.py          |   80 +-
 .../models/fnet/tokenization_fnet_fast.py     |   51 +-
 .../models/fsmt/configuration_fsmt.py         |  102 +-
 .../models/fsmt/tokenization_fsmt.py          |   71 +-
 .../models/funnel/configuration_funnel.py     |   75 +-
 .../models/funnel/modeling_tf_funnel.py       |   17 +-
 .../models/funnel/tokenization_funnel.py      |   20 +-
 .../models/funnel/tokenization_funnel_fast.py |   22 +-
 .../models/gpt2/configuration_gpt2.py         |  113 +-
 src/transformers/models/gpt2/modeling_gpt2.py |   20 +-
 .../models/gpt2/tokenization_gpt2.py          |   45 +-
 .../models/gpt2/tokenization_gpt2_fast.py     |   49 +-
 .../models/gpt_neo/configuration_gpt_neo.py   |   73 +-
 .../models/gptj/configuration_gptj.py         |   64 +-
 src/transformers/models/gptj/modeling_gptj.py |   21 +-
 .../models/herbert/tokenization_herbert.py    |    2 +-
 .../herbert/tokenization_herbert_fast.py      |   42 +-
 .../models/hubert/configuration_hubert.py     |  151 +-
 .../models/hubert/modeling_hubert.py          |   31 +-
 .../models/hubert/modeling_tf_hubert.py       |   31 +-
 .../models/ibert/configuration_ibert.py       |   60 +-
 .../models/ibert/quant_modules.py             |  122 +-
 .../models/imagegpt/configuration_imagegpt.py |   69 +-
 .../imagegpt/feature_extraction_imagegpt.py   |   54 +-
 .../models/layoutlm/configuration_layoutlm.py |   61 +-
 .../models/layoutlm/modeling_layoutlm.py      |   43 +-
 .../models/layoutlm/modeling_tf_layoutlm.py   |   43 +-
 .../models/layoutlm/tokenization_layoutlm.py  |    4 +-
 .../layoutlm/tokenization_layoutlm_fast.py    |    4 +-
 .../layoutlmv2/configuration_layoutlmv2.py    |   90 +-
 .../feature_extraction_layoutlmv2.py          |   82 +-
 .../models/layoutlmv2/modeling_layoutlmv2.py  |   21 +-
 .../layoutlmv2/processing_layoutlmv2.py       |   92 +-
 .../layoutlmv2/tokenization_layoutlmv2.py     |  169 ++-
 .../tokenization_layoutlmv2_fast.py           |   72 +-
 .../models/layoutxlm/processing_layoutxlm.py  |   88 +-
 .../layoutxlm/tokenization_layoutxlm.py       |  151 +-
 .../layoutxlm/tokenization_layoutxlm_fast.py  |   79 +-
 .../models/led/configuration_led.py           |   65 +-
 src/transformers/models/led/modeling_led.py   |   25 +-
 .../models/led/modeling_tf_led.py             |   50 +-
 .../models/led/tokenization_led.py            |    4 +-
 .../models/led/tokenization_led_fast.py       |    6 +-
 .../longformer/configuration_longformer.py    |   36 +-
 .../models/longformer/modeling_longformer.py  |   62 +-
 .../longformer/modeling_tf_longformer.py      |   25 +-
 .../longformer/tokenization_longformer.py     |    2 +-
 .../tokenization_longformer_fast.py           |    4 +-
 .../models/luke/configuration_luke.py         |   68 +-
 src/transformers/models/luke/modeling_luke.py |   41 +-
 .../models/luke/tokenization_luke.py          |  209 ++-
 .../models/lxmert/configuration_lxmert.py     |   74 +-
 .../models/lxmert/tokenization_lxmert.py      |    4 +-
 .../models/lxmert/tokenization_lxmert_fast.py |    6 +-
 .../models/m2m_100/configuration_m2m_100.py   |   76 +-
 .../models/m2m_100/tokenization_m2m_100.py    |   86 +-
 .../models/marian/configuration_marian.py     |   85 +-
 .../models/marian/modeling_flax_marian.py     |   87 +-
 .../models/marian/modeling_marian.py          |   51 +-
 .../models/marian/modeling_tf_marian.py       |   30 +-
 .../models/marian/tokenization_marian.py      |   98 +-
 .../models/mbart/configuration_mbart.py       |   86 +-
 .../models/mbart/modeling_flax_mbart.py       |   67 +-
 .../models/mbart/tokenization_mbart.py        |   49 +-
 .../models/mbart/tokenization_mbart_fast.py   |   42 +-
 .../models/mbart50/tokenization_mbart50.py    |   82 +-
 .../mbart50/tokenization_mbart50_fast.py      |   56 +-
 .../configuration_megatron_bert.py            |   75 +-
 .../models/mluke/tokenization_mluke.py        |  288 ++--
 .../models/mmbt/configuration_mmbt.py         |    8 +-
 src/transformers/models/mmbt/modeling_mmbt.py |   13 +-
 .../mobilebert/configuration_mobilebert.py    |   80 +-
 .../mobilebert/modeling_tf_mobilebert.py      |   41 +-
 .../mobilebert/tokenization_mobilebert.py     |    4 +-
 .../tokenization_mobilebert_fast.py           |    6 +-
 .../models/mpnet/configuration_mpnet.py       |   59 +-
 .../models/mpnet/tokenization_mpnet.py        |  106 +-
 .../models/mpnet/tokenization_mpnet_fast.py   |   60 +-
 .../models/mt5/configuration_mt5.py           |   47 +-
 .../models/openai/configuration_openai.py     |   95 +-
 .../models/openai/tokenization_openai.py      |   12 +-
 .../models/openai/tokenization_openai_fast.py |   10 +-
 .../models/pegasus/configuration_pegasus.py   |   86 +-
 .../models/pegasus/modeling_flax_pegasus.py   |   67 +-
 .../models/pegasus/modeling_pegasus.py        |   19 +-
 .../models/pegasus/tokenization_pegasus.py    |   71 +-
 .../pegasus/tokenization_pegasus_fast.py      |   53 +-
 .../perceiver/configuration_perceiver.py      |   83 +-
 .../perceiver/feature_extraction_perceiver.py |   64 +-
 .../models/perceiver/modeling_perceiver.py    |  131 +-
 .../perceiver/tokenization_perceiver.py       |   43 +-
 .../models/phobert/tokenization_phobert.py    |   64 +-
 .../prophetnet/configuration_prophetnet.py    |   68 +-
 .../models/prophetnet/modeling_prophetnet.py  |   40 +-
 .../prophetnet/tokenization_prophetnet.py     |   71 +-
 .../models/qdqbert/configuration_qdqbert.py   |   62 +-
 .../models/rag/configuration_rag.py           |   94 +-
 src/transformers/models/rag/modeling_rag.py   |   21 +-
 .../models/rag/modeling_tf_rag.py             |   24 +-
 src/transformers/models/rag/retrieval_rag.py  |  162 +--
 .../models/reformer/configuration_reformer.py |  152 +-
 .../models/reformer/tokenization_reformer.py  |   42 +-
 .../reformer/tokenization_reformer_fast.py    |   26 +-
 .../models/rembert/configuration_rembert.py   |   58 +-
 .../models/rembert/tokenization_rembert.py    |   80 +-
 .../rembert/tokenization_rembert_fast.py      |   74 +-
 .../retribert/configuration_retribert.py      |   40 +-
 .../retribert/tokenization_retribert.py       |    4 +-
 .../retribert/tokenization_retribert_fast.py  |    6 +-
 .../models/roberta/configuration_roberta.py   |   29 +-
 .../models/roberta/tokenization_roberta.py    |   97 +-
 .../roberta/tokenization_roberta_fast.py      |   85 +-
 .../models/roformer/configuration_roformer.py |   71 +-
 .../models/roformer/tokenization_roformer.py  |   83 +-
 .../roformer/tokenization_roformer_fast.py    |   46 +-
 .../segformer/configuration_segformer.py      |   73 +-
 .../segformer/feature_extraction_segformer.py |   54 +-
 .../models/segformer/modeling_segformer.py    |   25 +-
 .../models/sew/configuration_sew.py           |  142 +-
 .../models/sew_d/configuration_sew_d.py       |  166 ++-
 .../configuration_speech_encoder_decoder.py   |   63 +-
 .../modeling_speech_encoder_decoder.py        |   30 +-
 .../configuration_speech_to_text.py           |   89 +-
 .../feature_extraction_speech_to_text.py      |   62 +-
 .../processing_speech_to_text.py              |   76 +-
 .../tokenization_speech_to_text.py            |   51 +-
 .../configuration_speech_to_text_2.py         |   68 +-
 .../processing_speech_to_text_2.py            |   76 +-
 .../tokenization_speech_to_text_2.py          |   14 +-
 .../models/splinter/configuration_splinter.py |   66 +-
 .../models/splinter/tokenization_splinter.py  |   87 +-
 .../splinter/tokenization_splinter_fast.py    |   55 +-
 .../squeezebert/configuration_squeezebert.py  |   72 +-
 .../squeezebert/tokenization_squeezebert.py   |    4 +-
 .../tokenization_squeezebert_fast.py          |    6 +-
 .../models/t5/configuration_t5.py             |   49 +-
 .../models/t5/modeling_flax_t5.py             |  111 +-
 src/transformers/models/t5/modeling_t5.py     |   58 +-
 src/transformers/models/t5/modeling_tf_t5.py  |   38 +-
 src/transformers/models/t5/tokenization_t5.py |   77 +-
 .../models/t5/tokenization_t5_fast.py         |   47 +-
 .../models/tapas/configuration_tapas.py       |  111 +-
 .../models/tapas/modeling_tapas.py            |   31 +-
 .../models/tapas/modeling_tf_tapas.py         |   31 +-
 .../models/tapas/tokenization_tapas.py        |  265 ++--
 .../transfo_xl/configuration_transfo_xl.py    |   83 +-
 .../modeling_transfo_xl_utilities.py          |   10 +-
 .../transfo_xl/tokenization_transfo_xl.py     |   73 +-
 .../models/trocr/configuration_trocr.py       |   72 +-
 .../models/trocr/processing_trocr.py          |   76 +-
 .../unispeech/configuration_unispeech.py      |  171 ++-
 .../configuration_unispeech_sat.py            |  191 ++-
 .../unispeech_sat/modeling_unispeech_sat.py   |   63 +-
 .../configuration_vision_encoder_decoder.py   |   63 +-
 .../modeling_flax_vision_encoder_decoder.py   |  101 +-
 .../modeling_vision_encoder_decoder.py        |   48 +-
 .../configuration_vision_text_dual_encoder.py |   65 +-
 .../modeling_flax_vision_text_dual_encoder.py |   47 +-
 .../modeling_vision_text_dual_encoder.py      |   46 +-
 .../processing_vision_text_dual_encoder.py    |  102 +-
 .../visual_bert/configuration_visual_bert.py  |   75 +-
 .../visual_bert/modeling_visual_bert.py       |   37 +-
 .../models/vit/configuration_vit.py           |   56 +-
 .../models/vit/feature_extraction_vit.py      |   48 +-
 .../models/vit/modeling_flax_vit.py           |   56 +-
 .../models/vit/modeling_tf_vit.py             |   25 +-
 src/transformers/models/vit/modeling_vit.py   |   25 +-
 .../models/wav2vec2/configuration_wav2vec2.py |  212 ++-
 .../wav2vec2/feature_extraction_wav2vec2.py   |   88 +-
 .../models/wav2vec2/modeling_flax_wav2vec2.py |  131 +-
 .../models/wav2vec2/modeling_tf_wav2vec2.py   |   31 +-
 .../models/wav2vec2/processing_wav2vec2.py    |   84 +-
 .../models/wav2vec2/tokenization_wav2vec2.py  |  122 +-
 .../tokenization_wav2vec2_phoneme.py          |   79 +-
 .../processing_wav2vec2_with_lm.py            |  122 +-
 .../models/wavlm/configuration_wavlm.py       |  212 +--
 .../models/xlm/configuration_xlm.py           |  113 +-
 .../models/xlm/tokenization_xlm.py            |   77 +-
 .../configuration_xlm_prophetnet.py           |    2 +-
 .../tokenization_xlm_prophetnet.py            |   89 +-
 .../xlm_roberta/configuration_xlm_roberta.py  |    2 +-
 .../xlm_roberta/tokenization_xlm_roberta.py   |   89 +-
 .../tokenization_xlm_roberta_fast.py          |   59 +-
 .../models/xlnet/configuration_xlnet.py       |  125 +-
 .../models/xlnet/modeling_tf_xlnet.py         |   41 +-
 .../models/xlnet/tokenization_xlnet.py        |  105 +-
 .../models/xlnet/tokenization_xlnet_fast.py   |   79 +-
 src/transformers/optimization.py              |  190 +--
 src/transformers/optimization_tf.py           |   72 +-
 src/transformers/pipelines/__init__.py        |  172 +--
 .../pipelines/audio_classification.py         |   31 +-
 .../pipelines/automatic_speech_recognition.py |   36 +-
 src/transformers/pipelines/base.py            |  240 ++--
 src/transformers/pipelines/conversational.py  |  101 +-
 .../pipelines/feature_extraction.py           |   32 +-
 src/transformers/pipelines/fill_mask.py       |   41 +-
 .../pipelines/image_classification.py         |   17 +-
 .../pipelines/image_segmentation.py           |   24 +-
 .../pipelines/object_detection.py             |   23 +-
 .../pipelines/question_answering.py           |   88 +-
 .../pipelines/table_question_answering.py     |   82 +-
 .../pipelines/text2text_generation.py         |  114 +-
 .../pipelines/text_classification.py          |   45 +-
 src/transformers/pipelines/text_generation.py |   38 +-
 .../pipelines/token_classification.py         |   48 +-
 .../pipelines/zero_shot_classification.py     |   40 +-
 src/transformers/testing_utils.py             |  238 ++--
 src/transformers/tokenization_utils.py        |  120 +-
 src/transformers/tokenization_utils_base.py   |  999 +++++++-------
 src/transformers/tokenization_utils_fast.py   |   65 +-
 src/transformers/trainer.py                   |  303 ++--
 src/transformers/trainer_callback.py          |  143 +-
 src/transformers/trainer_pt_utils.py          |  189 +--
 src/transformers/trainer_seq2seq.py           |   66 +-
 src/transformers/trainer_tf.py                |  120 +-
 src/transformers/trainer_utils.py             |   43 +-
 src/transformers/training_args.py             |  450 +++---
 src/transformers/training_args_seq2seq.py     |   18 +-
 src/transformers/training_args_tf.py          |  169 +--
 src/transformers/utils/fx.py                  |   40 +-
 src/transformers/utils/logging.py             |   41 +-
 src/transformers/utils/notebook.py            |   69 +-
 src/transformers/utils/versions.py            |   16 +-
 ...on_{{cookiecutter.lowercase_modelname}}.py |  116 +-
 ...st_{{cookiecutter.lowercase_modelname}}.py |   22 +-
 ...on_{{cookiecutter.lowercase_modelname}}.py |   50 +-
 utils/check_repo.py                           |   45 +
 349 files changed, 14049 insertions(+), 13622 deletions(-)

diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 23938bfb1f..25358fae42 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -79,36 +79,36 @@ class PretrainedConfig(PushToHubMixin):
     - **num_hidden_layers** (`int`) -- The number of blocks in the model.
 
     Arg:
-        name_or_path (`str`, _optional_, defaults to `""`):
+        name_or_path (`str`, *optional*, defaults to `""`):
             Store the string that was passed to [`PreTrainedModel.from_pretrained`] or
             [`TFPreTrainedModel.from_pretrained`] as `pretrained_model_name_or_path` if the configuration was created
             with such a method.
-        output_hidden_states (`bool`, _optional_, defaults to `False`):
+        output_hidden_states (`bool`, *optional*, defaults to `False`):
             Whether or not the model should return all hidden-states.
-        output_attentions (`bool`, _optional_, defaults to `False`):
+        output_attentions (`bool`, *optional*, defaults to `False`):
             Whether or not the model should returns all attentions.
-        return_dict (`bool`, _optional_, defaults to `True`):
+        return_dict (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return a [`~transformers.file_utils.ModelOutput`] instead of a plain tuple.
-        is_encoder_decoder (`bool`, _optional_, defaults to `False`):
+        is_encoder_decoder (`bool`, *optional*, defaults to `False`):
             Whether the model is used as an encoder/decoder or not.
-        is_decoder (`bool`, _optional_, defaults to `False`):
+        is_decoder (`bool`, *optional*, defaults to `False`):
             Whether the model is used as decoder or not (in which case it's used as an encoder).
-        cross_attention_hidden_size** (`bool`, _optional_):
+        cross_attention_hidden_size (`int`, *optional*):
             The hidden size of the cross-attention layer in case the model is used as a decoder in an encoder-decoder
             setting and the cross-attention hidden dimension differs from `self.config.hidden_size`.
-        add_cross_attention (`bool`, _optional_, defaults to `False`):
+        add_cross_attention (`bool`, *optional*, defaults to `False`):
             Whether cross-attention layers should be added to the model. Note, this option is only relevant for models
             that can be used as decoder models within the [`EncoderDecoderModel`] class, which consists of all models
             in `AUTO_MODELS_FOR_CAUSAL_LM`.
-        tie_encoder_decoder (`bool`, _optional_, defaults to `False`):
+        tie_encoder_decoder (`bool`, *optional*, defaults to `False`):
             Whether all encoder weights should be tied to their equivalent decoder weights. This requires the encoder
             and decoder model to have the exact same parameter names.
-        prune_heads (`Dict[int, List[int]]`, _optional_, defaults to `{}`):
+        prune_heads (`Dict[int, List[int]]`, *optional*, defaults to `{}`):
             Pruned heads of the model. The keys are the selected layer indices and the associated values, the list of
             heads to prune in said layer.
 
             For instance `{1: [0, 2], 2: [2, 3]}` will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
-        chunk_size_feed_forward (`int`, _optional_, defaults to `0`):
+        chunk_size_feed_forward (`int`, *optional*, defaults to `0`):
             The chunk size of all feed forward layers in the residual attention blocks. A chunk size of `0` means that
             the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes `n` <
             sequence_length embeddings at a time. For more information on feed forward chunking, see [How does Feed
@@ -116,105 +116,105 @@ class PretrainedConfig(PushToHubMixin):
 
         > Parameters for sequence generation
 
-        max_length (`int`, _optional_, defaults to 20):
+        max_length (`int`, *optional*, defaults to 20):
             Maximum length that will be used by default in the `generate` method of the model.
-        min_length (`int`, _optional_, defaults to 10):
+        min_length (`int`, *optional*, defaults to 10):
             Minimum length that will be used by default in the `generate` method of the model.
-        do_sample (`bool`, _optional_, defaults to `False`):
+        do_sample (`bool`, *optional*, defaults to `False`):
             Flag that will be used by default in the `generate` method of the model. Whether or not to use sampling ;
             use greedy decoding otherwise.
-        early_stopping (`bool`, _optional_, defaults to `False`):
+        early_stopping (`bool`, *optional*, defaults to `False`):
             Flag that will be used by default in the `generate` method of the model. Whether to stop the beam search
             when at least `num_beams` sentences are finished per batch or not.
-        num_beams (`int`, _optional_, defaults to 1):
+        num_beams (`int`, *optional*, defaults to 1):
             Number of beams for beam search that will be used by default in the `generate` method of the model. 1 means
             no beam search.
-        num_beam_groups (`int`, _optional_, defaults to 1):
+        num_beam_groups (`int`, *optional*, defaults to 1):
             Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams
             that will be used by default in the `generate` method of the model. 1 means no group beam search.
-        diversity_penalty (`float`, _optional_, defaults to 0.0):
+        diversity_penalty (`float`, *optional*, defaults to 0.0):
             Value to control diversity for group beam search. that will be used by default in the `generate` method of
             the model. 0 means no diversity penalty. The higher the penalty, the more diverse are the outputs.
-        temperature (`float`, _optional_, defaults to 1):
+        temperature (`float`, *optional*, defaults to 1):
             The value used to module the next token probabilities that will be used by default in the `generate` method
             of the model. Must be strictly positive.
-        top_k (`int`, _optional_, defaults to 50):
+        top_k (`int`, *optional*, defaults to 50):
             Number of highest probability vocabulary tokens to keep for top-k-filtering that will be used by default in
             the `generate` method of the model.
-        top_p (`float`, _optional_, defaults to 1):
+        top_p (`float`, *optional*, defaults to 1):
             Value that will be used by default in the `generate` method of the model for `top_p`. If set to float < 1,
             only the most probable tokens with probabilities that add up to `top_p` or higher are kept for generation.
-        repetition_penalty (`float`, _optional_, defaults to 1):
+        repetition_penalty (`float`, *optional*, defaults to 1):
             Parameter for repetition penalty that will be used by default in the `generate` method of the model. 1.0
             means no penalty.
-        length_penalty (`float`, _optional_, defaults to 1):
+        length_penalty (`float`, *optional*, defaults to 1):
             Exponential penalty to the length that will be used by default in the `generate` method of the model.
-        no_repeat_ngram_size (`int`, _optional_, defaults to 0) -- Value that will be used by default in the
+        no_repeat_ngram_size (`int`, *optional*, defaults to 0): Value that will be used by default in the
             `generate` method of the model for `no_repeat_ngram_size`. If set to int > 0, all ngrams of that size can
             only occur once.
-        encoder_no_repeat_ngram_size (`int`, _optional_, defaults to 0) -- Value that will be used by
+        encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0): Value that will be used by
             default in the `generate` method of the model for `encoder_no_repeat_ngram_size`. If set to int > 0, all
             ngrams of that size that occur in the `encoder_input_ids` cannot occur in the `decoder_input_ids`.
-        bad_words_ids (`List[int]`, _optional_):
+        bad_words_ids (`List[int]`, *optional*):
             List of token ids that are not allowed to be generated that will be used by default in the `generate`
             method of the model. In order to get the tokens of the words that should not appear in the generated text,
             use `tokenizer.encode(bad_word, add_prefix_space=True)`.
-        num_return_sequences (`int`, _optional_, defaults to 1):
+        num_return_sequences (`int`, *optional*, defaults to 1):
             Number of independently computed returned sequences for each element in the batch that will be used by
             default in the `generate` method of the model.
-        output_scores (`bool`, _optional_, defaults to `False`):
+        output_scores (`bool`, *optional*, defaults to `False`):
             Whether the model should return the logits when used for generation.
-        return_dict_in_generate (`bool`, _optional_, defaults to `False`):
+        return_dict_in_generate (`bool`, *optional*, defaults to `False`):
             Whether the model should return a [`~transformers.file_utils.ModelOutput`] instead of a `torch.LongTensor`.
-        forced_bos_token_id (`int`, _optional_):
+        forced_bos_token_id (`int`, *optional*):
             The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful for
             multilingual models like [mBART](../model_doc/mbart) where the first generated token needs to be the target
             language token.
-        forced_eos_token_id (`int`, _optional_):
+        forced_eos_token_id (`int`, *optional*):
             The id of the token to force as the last generated token when `max_length` is reached.
-        remove_invalid_values (`bool`, _optional_):
+        remove_invalid_values (`bool`, *optional*):
             Whether to remove possible _nan_ and _inf_ outputs of the model to prevent the generation method to crash.
             Note that using `remove_invalid_values` can slow down generation.
 
         > Parameters for fine-tuning tasks
 
-        architectures (`List[str]`, _optional_): Model architectures that can be used with the model pretrained weights.
-        finetuning_task (`str`, _optional_):
+        architectures (`List[str]`, *optional*): Model architectures that can be used with the model pretrained weights.
+        finetuning_task (`str`, *optional*):
             Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow
             or PyTorch) checkpoint.
-        id2label (`Dict[int, str]`, _optional_):
+        id2label (`Dict[int, str]`, *optional*):
             A map from index (for instance prediction index, or target index) to label.
-        label2id (`Dict[str, int]`, _optional_): A map from label to index for the model.
-        num_labels (`int`, _optional_):
+        label2id (`Dict[str, int]`, *optional*): A map from label to index for the model.
+        num_labels (`int`, *optional*):
             Number of labels to use in the last layer added to the model, typically for a classification task.
-        task_specific_params (`Dict[str, Any]`, _optional_):
+        task_specific_params (`Dict[str, Any]`, *optional*):
             Additional keyword arguments to store for the current task.
-        problem_type (`str`, _optional_):
+        problem_type (`str`, *optional*):
             Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`,
             `"single_label_classification"` or `"multi_label_classification"`.
 
         > Parameters linked to the tokenizer
 
-        tokenizer_class (`str`, _optional_):
+        tokenizer_class (`str`, *optional*):
             The name of the associated tokenizer class to use (if none is set, will use the tokenizer associated to the
             model by default).
-        prefix (`str`, _optional_):
+        prefix (`str`, *optional*):
             A specific prompt that should be added at the beginning of each text before calling the model.
-        bos_token_id (`int`, _optional_): The id of the _beginning-of-stream_ token.
-        pad_token_id (`int`, _optional_): The id of the _padding_ token.
-        eos_token_id (`int`, _optional_): The id of the _end-of-stream_ token.
-        decoder_start_token_id (`int`, _optional_):
+        bos_token_id (`int`, *optional*): The id of the _beginning-of-stream_ token.
+        pad_token_id (`int`, *optional*): The id of the _padding_ token.
+        eos_token_id (`int`, *optional*): The id of the _end-of-stream_ token.
+        decoder_start_token_id (`int`, *optional*):
             If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token.
-        sep_token_id (`int`, _optional_): The id of the _separation_ token.
+        sep_token_id (`int`, *optional*): The id of the _separation_ token.
 
         > PyTorch specific parameters
 
-        torchscript (`bool`, _optional_, defaults to `False`):
+        torchscript (`bool`, *optional*, defaults to `False`):
             Whether or not the model should be used with Torchscript.
-        tie_word_embeddings (`bool`, _optional_, defaults to `True`):
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
             Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
             model has a output word embedding layer.
-        torch_dtype (`str`, _optional_):
+        torch_dtype (`str`, *optional*):
             The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype`
             (which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved
             model is `float16`, ideally we want to load it back using the minimal amount of memory needed to load
@@ -227,7 +227,7 @@ class PretrainedConfig(PushToHubMixin):
 
         > TensorFlow specific parameters
 
-        use_bfloat16 (`bool`, _optional_, defaults to `False`):
+        use_bfloat16 (`bool`, *optional*, defaults to `False`):
             Whether or not the model should use BFloat16 scalars (only used by some TensorFlow models).
     """
     model_type: str = ""
@@ -370,7 +370,7 @@ class PretrainedConfig(PushToHubMixin):
     @property
     def use_return_dict(self) -> bool:
         """
-        :obj:`bool`: Whether or not return :class:`~transformers.file_utils.ModelOutput` instead of tuples.
+        `bool`: Whether or not to return [`~file_utils.ModelOutput`] instead of tuples.
         """
         # If torchscript is set, force `return_dict=False` to avoid jit errors
         return self.return_dict and not self.torchscript
@@ -378,7 +378,7 @@ class PretrainedConfig(PushToHubMixin):
     @property
     def num_labels(self) -> int:
         """
-        :obj:`int`: The number of labels for classification models.
+        `int`: The number of labels for classification models.
         """
         return len(self.id2label)
 
@@ -390,25 +390,27 @@ class PretrainedConfig(PushToHubMixin):
 
     def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
         """
-        Save a configuration object to the directory ``save_directory``, so that it can be re-loaded using the
-        :func:`~transformers.PretrainedConfig.from_pretrained` class method.
+        Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the
+        [`~PretrainedConfig.from_pretrained`] class method.
 
         Args:
-            save_directory (:obj:`str` or :obj:`os.PathLike`):
+            save_directory (`str` or `os.PathLike`):
                 Directory where the configuration JSON file will be saved (will be created if it does not exist).
-            push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            push_to_hub (`bool`, *optional*, defaults to `False`):
                 Whether or not to push your model to the Hugging Face model hub after saving it.
 
-                .. warning::
+                <Tip warning={true}>
 
-                    Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with
-                    :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are
-                    pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory
-                    instead.
+                Using `push_to_hub=True` will synchronize the repository you are pushing to with
+                `save_directory`, which requires `save_directory` to be a local clone of the repo you are
+                pushing to if it's an existing folder. Pass along `temp_dir=True` to use a temporary directory
+                instead.
+
+                </Tip>
 
             kwargs:
                 Additional key word arguments passed along to the
-                :meth:`~transformers.file_utils.PushToHubMixin.push_to_hub` method.
+                [`~file_utils.PushToHubMixin.push_to_hub`] method.
         """
         if os.path.isfile(save_directory):
             raise AssertionError(f"Provided path ({save_directory}) should be a directory, not a file")
@@ -431,73 +433,73 @@ class PretrainedConfig(PushToHubMixin):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
         r"""
-        Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pretrained model
+        Instantiate a [`PretrainedConfig`] (or a derived class) from a pretrained model
         configuration.
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 This can be either:
 
-                - a string, the `model id` of a pretrained model configuration hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a configuration file saved using the
-                  :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g., ``./my_model_directory/``.
-                - a path or url to a saved configuration JSON `file`, e.g.,
-                  ``./my_model_directory/configuration.json``.
-            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+                - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a configuration file saved using the
+                  [`~PretrainedConfig.save_pretrained`] method, e.g., `./my_model_directory/`.
+                - a path or url to a saved configuration JSON *file*, e.g.,
+                  `./my_model_directory/configuration.json`.
+            cache_dir (`str` or `os.PathLike`, *optional*):
                 Path to a directory in which a downloaded pretrained model configuration should be cached if the
                 standard cache should not be used.
-            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force to (re-)download the configuration files and override the cached versions if
                 they exist.
-            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            resume_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to delete incompletely received file. Attempts to resume the download if such a file
                 exists.
-            proxies (:obj:`Dict[str, str]`, `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
-            use_auth_token (:obj:`str` or `bool`, `optional`):
-                The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
-                generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
-            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            use_auth_token (`str` or `bool`, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+                generated when running `transformers-cli login` (stored in `~/.huggingface`).
+            revision (`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                 identifier allowed by git.
-            return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                If :obj:`False`, then this function returns just the final configuration object.
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                If `False`, then this function returns just the final configuration object.
 
-                If :obj:`True`, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs`
+                If `True`, then this function returns a `Tuple(config, unused_kwargs)` where `unused_kwargs`
                 is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e.,
-                the part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored.
-            kwargs (:obj:`Dict[str, Any]`, `optional`):
+                the part of `kwargs` which has not been used to update `config` and is otherwise ignored.
+            kwargs (`Dict[str, Any]`, *optional*):
                 The values in kwargs of any keys which are configuration attributes will be used to override the loaded
                 values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
-                by the ``return_unused_kwargs`` keyword parameter.
+                by the `return_unused_kwargs` keyword parameter.
 
-        .. note::
+        <Tip>
 
-            Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+        Passing `use_auth_token=True` is required when you want to use a private model.
 
+        </Tip>
 
         Returns:
-            :class:`PretrainedConfig`: The configuration object instantiated from this pretrained model.
+            [`PretrainedConfig`]: The configuration object instantiated from this pretrained model.
 
-        Examples::
+        Examples:
 
-            # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
-            # derived class: BertConfig
-            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from huggingface.co and cache.
-            config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
-            config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
-            config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
-            assert config.output_attentions == True
-            config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True,
-                                                               foo=False, return_unused_kwargs=True)
-            assert config.output_attentions == True
-            assert unused_kwargs == {'foo': False}
-
-        """
+        ```python
+        # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
+        # derived class: BertConfig
+        config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from huggingface.co and cache.
+        config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+        config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
+        config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
+        assert config.output_attentions == True
+        config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True,
+                                                           foo=False, return_unused_kwargs=True)
+        assert config.output_attentions == True
+        assert unused_kwargs == {'foo': False}
+        ```"""
         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
         if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
             logger.warn(
@@ -512,17 +514,17 @@ class PretrainedConfig(PushToHubMixin):
         cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
     ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         """
-        From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a
-        :class:`~transformers.PretrainedConfig` using ``from_dict``.
+        From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
+        [`PretrainedConfig`] using `from_dict`.
 
 
 
         Parameters:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
 
         Returns:
-            :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object.
+            `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the configuration object.
 
         """
         cache_dir = kwargs.pop("cache_dir", None)
@@ -608,18 +610,18 @@ class PretrainedConfig(PushToHubMixin):
     @classmethod
     def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PretrainedConfig":
         """
-        Instantiates a :class:`~transformers.PretrainedConfig` from a Python dictionary of parameters.
+        Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters.
 
         Args:
-            config_dict (:obj:`Dict[str, Any]`):
+            config_dict (`Dict[str, Any]`):
                 Dictionary that will be used to instantiate the configuration object. Such a dictionary can be
                 retrieved from a pretrained checkpoint by leveraging the
-                :func:`~transformers.PretrainedConfig.get_config_dict` method.
-            kwargs (:obj:`Dict[str, Any]`):
+                [`~PretrainedConfig.get_config_dict`] method.
+            kwargs (`Dict[str, Any]`):
                 Additional parameters from which to initialize the configuration object.
 
         Returns:
-            :class:`PretrainedConfig`: The configuration object instantiated from those parameters.
+            [`PretrainedConfig`]: The configuration object instantiated from those parameters.
         """
         return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
 
@@ -647,14 +649,14 @@ class PretrainedConfig(PushToHubMixin):
     @classmethod
     def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "PretrainedConfig":
         """
-        Instantiates a :class:`~transformers.PretrainedConfig` from the path to a JSON file of parameters.
+        Instantiates a [`PretrainedConfig`] from the path to a JSON file of parameters.
 
         Args:
-            json_file (:obj:`str` or :obj:`os.PathLike`):
+            json_file (`str` or `os.PathLike`):
                 Path to the JSON file containing the parameters.
 
         Returns:
-            :class:`PretrainedConfig`: The configuration object instantiated from that JSON file.
+            [`PretrainedConfig`]: The configuration object instantiated from that JSON file.
 
         """
         config_dict = cls._dict_from_json_file(json_file)
@@ -678,7 +680,7 @@ class PretrainedConfig(PushToHubMixin):
         serializes to a Python dictionary.
 
         Returns:
-            :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance,
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
         """
         config_dict = self.to_dict()
 
@@ -709,7 +711,7 @@ class PretrainedConfig(PushToHubMixin):
         Serializes this instance to a Python dictionary.
 
         Returns:
-            :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
         """
         output = copy.deepcopy(self.__dict__)
         if hasattr(self.__class__, "model_type"):
@@ -727,12 +729,12 @@ class PretrainedConfig(PushToHubMixin):
         Serializes this instance to a JSON string.
 
         Args:
-            use_diff (:obj:`bool`, `optional`, defaults to :obj:`True`):
-                If set to ``True``, only the difference between the config instance and the default
-                ``PretrainedConfig()`` is serialized to JSON string.
+            use_diff (`bool`, *optional*, defaults to `True`):
+                If set to `True`, only the difference between the config instance and the default
+                `PretrainedConfig()` is serialized to JSON string.
 
         Returns:
-            :obj:`str`: String containing all the attributes that make up this configuration instance in JSON format.
+            `str`: String containing all the attributes that make up this configuration instance in JSON format.
         """
         if use_diff is True:
             config_dict = self.to_diff_dict()
@@ -745,36 +747,36 @@ class PretrainedConfig(PushToHubMixin):
         Save this instance to a JSON file.
 
         Args:
-            json_file_path (:obj:`str` or :obj:`os.PathLike`):
+            json_file_path (`str` or `os.PathLike`):
                 Path to the JSON file in which this configuration instance's parameters will be saved.
-            use_diff (:obj:`bool`, `optional`, defaults to :obj:`True`):
-                If set to ``True``, only the difference between the config instance and the default
-                ``PretrainedConfig()`` is serialized to JSON file.
+            use_diff (`bool`, *optional*, defaults to `True`):
+                If set to `True`, only the difference between the config instance and the default
+                `PretrainedConfig()` is serialized to JSON file.
         """
         with open(json_file_path, "w", encoding="utf-8") as writer:
             writer.write(self.to_json_string(use_diff=use_diff))
 
     def update(self, config_dict: Dict[str, Any]):
         """
-        Updates attributes of this class with attributes from ``config_dict``.
+        Updates attributes of this class with attributes from `config_dict`.
 
         Args:
-            config_dict (:obj:`Dict[str, Any]`): Dictionary of attributes that should be updated for this class.
+            config_dict (`Dict[str, Any]`): Dictionary of attributes that should be updated for this class.
         """
         for key, value in config_dict.items():
             setattr(self, key, value)
 
     def update_from_string(self, update_str: str):
         """
-        Updates attributes of this class with attributes from ``update_str``.
+        Updates attributes of this class with attributes from `update_str`.
 
-        The expected format is ints, floats and strings as is, and for booleans use ``true`` or ``false``. For example:
+        The expected format is ints, floats and strings as is, and for booleans use `true` or `false`. For example:
         "n_embd=10,resid_pdrop=0.2,scale_attn_weights=false,summary_type=cls_index"
 
         The keys to change have to already exist in the config object.
 
         Args:
-            update_str (:obj:`str`): String with attributes that should be updated for this class.
+            update_str (`str`): String with attributes that should be updated for this class.
 
         """
 
@@ -804,8 +806,8 @@ class PretrainedConfig(PushToHubMixin):
 
     def dict_torch_dtype_to_str(self, d: Dict[str, Any]) -> None:
         """
-        Checks whether the passed dictionary has a `torch_dtype` key and if it's not None, converts torch.dtype to a
-        string of just the type. For example, :obj:`torch.float32` get converted into `"float32"` string, which can
+        Checks whether the passed dictionary has a `torch_dtype` key and if it's not None, converts torch.dtype to a
+        string of just the type. For example, `torch.float32` gets converted into the `"float32"` string, which can
         then be stored in the json format.
         """
         if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str):
@@ -822,20 +824,20 @@ def get_configuration_file(
     Get the configuration file to use for this version of transformers.
 
     Args:
-        path_or_repo (:obj:`str` or :obj:`os.PathLike`):
-            Can be either the id of a repo on huggingface.co or a path to a `directory`.
-        revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+        path_or_repo (`str` or `os.PathLike`):
+            Can be either the id of a repo on huggingface.co or a path to a *directory*.
+        revision (`str`, *optional*, defaults to `"main"`):
             The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-            git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
             identifier allowed by git.
-        use_auth_token (:obj:`str` or `bool`, `optional`):
-            The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
-            generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
-        local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_auth_token (`str` or `bool`, *optional*):
+            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+            generated when running `transformers-cli login` (stored in `~/.huggingface`).
+        local_files_only (`bool`, *optional*, defaults to `False`):
             Whether or not to only rely on local files and not to attempt to download any files.
 
     Returns:
-        :obj:`str`: The configuration file to use.
+        `str`: The configuration file to use.
     """
     # Inspect all files from the repo/folder.
     all_files = get_list_of_files(
diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py
index ef6832ed71..e9611fdca6 100644
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -13,10 +13,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
- Utilities to convert slow tokenizers in their fast tokenizers counterparts.
+Utilities to convert slow tokenizers into their fast tokenizer counterparts.
 
-    All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
-    allow to make our dependency on SentencePiece optional.
+All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
+allow to make our dependency on SentencePiece optional.
 """
 
 from typing import Dict, List, Tuple
@@ -960,13 +960,13 @@ def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer:
     Utilities to convert a slow tokenizer instance in a fast tokenizer instance.
 
     Args:
-        transformer_tokenizer (:class:`~transformers.tokenization_utils_base.PreTrainedTokenizer`):
+        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
             Instance of a slow tokenizer to convert in the backend tokenizer for
-            :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerFast`.
+            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
 
     Return:
-        A instance of :class:`~tokenizers.Tokenizer` to be used as the backend tokenizer of a
-        :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerFast`
+        An instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
+        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
     """
 
     tokenizer_class_name = transformer_tokenizer.__class__.__name__
diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py
index 8b16280e3f..f6750268cb 100644
--- a/src/transformers/data/data_collator.py
+++ b/src/transformers/data/data_collator.py
@@ -50,8 +50,8 @@ def default_data_collator(features: List[InputDataClass], return_tensors="pt") -
     Very simple data collator that simply collates batches of dict-like objects and performs special handling for
     potential keys named:
 
-        - ``label``: handles a single value (int or float) per object
-        - ``label_ids``: handles a list of values per object
+        - `label`: handles a single value (int or float) per object
+        - `label_ids`: handles a list of values per object
 
     Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
     to the model. See glue and ner for example of how it's useful.
@@ -76,8 +76,8 @@ class DefaultDataCollator(DataCollatorMixin):
     Very simple data collator that simply collates batches of dict-like objects and performs special handling for
     potential keys named:
 
-        - ``label``: handles a single value (int or float) per object
-        - ``label_ids``: handles a list of values per object
+        - `label`: handles a single value (int or float) per object
+        - `label_ids`: handles a list of values per object
 
     Does not do any additional preprocessing: property names of the input object will be used as corresponding inputs
     to the model. See glue and ner for example of how it's useful.
@@ -86,7 +86,7 @@ class DefaultDataCollator(DataCollatorMixin):
     helpful if you need to set a return_tensors value at initialization.
 
     Args:
-        return_tensors (:obj:`str`):
+        return_tensors (`str`):
             The type of Tensor to return. Allowable values are "np", "pt" and "tf".
     """
 
@@ -213,26 +213,26 @@ class DataCollatorWithPadding:
     Data collator that will dynamically pad the inputs received.
 
     Args:
-        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
             The tokenizer used for encoding the data.
-        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+        padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
             Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
             among:
 
-            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
               sequence if provided).
-            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
               maximum acceptable input length for the model if that argument is not provided.
-            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+            - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
               different lengths).
-        max_length (:obj:`int`, `optional`):
+        max_length (`int`, *optional*):
             Maximum length of the returned list and optionally padding length (see above).
-        pad_to_multiple_of (:obj:`int`, `optional`):
+        pad_to_multiple_of (`int`, *optional*):
             If set will pad the sequence to a multiple of the provided value.
 
             This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
             7.5 (Volta).
-        return_tensors (:obj:`str`):
+        return_tensors (`str`):
             The type of Tensor to return. Allowable values are "np", "pt" and "tf".
     """
 
@@ -265,28 +265,28 @@ class DataCollatorForTokenClassification(DataCollatorMixin):
     Data collator that will dynamically pad the inputs received, as well as the labels.
 
     Args:
-        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
             The tokenizer used for encoding the data.
-        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+        padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
             Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
             among:
 
-            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
               sequence if provided).
-            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
               maximum acceptable input length for the model if that argument is not provided.
-            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+            - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
               different lengths).
-        max_length (:obj:`int`, `optional`):
+        max_length (`int`, *optional*):
             Maximum length of the returned list and optionally padding length (see above).
-        pad_to_multiple_of (:obj:`int`, `optional`):
+        pad_to_multiple_of (`int`, *optional*):
             If set will pad the sequence to a multiple of the provided value.
 
             This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
             7.5 (Volta).
-        label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
+        label_pad_token_id (`int`, *optional*, defaults to -100):
             The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions).
-        return_tensors (:obj:`str`):
+        return_tensors (`str`):
             The type of Tensor to return. Allowable values are "np", "pt" and "tf".
     """
 
@@ -515,33 +515,33 @@ class DataCollatorForSeq2Seq:
     Data collator that will dynamically pad the inputs received, as well as the labels.
 
     Args:
-        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
             The tokenizer used for encoding the data.
-        model (:class:`~transformers.PreTrainedModel`):
-            The model that is being trained. If set and has the `prepare_decoder_input_ids_from_labels`, use it to
-            prepare the `decoder_input_ids`
+        model ([`PreTrainedModel`]):
+            The model that is being trained. If set and has the `prepare_decoder_input_ids_from_labels`, use it to
+            prepare the `decoder_input_ids`
 
-            This is useful when using `label_smoothing` to avoid calculating loss twice.
-        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+            This is useful when using `label_smoothing` to avoid calculating loss twice.
+        padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
             Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
             among:
 
-            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
               sequence is provided).
-            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
               maximum acceptable input length for the model if that argument is not provided.
-            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+            - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
               different lengths).
-        max_length (:obj:`int`, `optional`):
+        max_length (`int`, *optional*):
             Maximum length of the returned list and optionally padding length (see above).
-        pad_to_multiple_of (:obj:`int`, `optional`):
+        pad_to_multiple_of (`int`, *optional*):
             If set will pad the sequence to a multiple of the provided value.
 
             This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
             7.5 (Volta).
-        label_pad_token_id (:obj:`int`, `optional`, defaults to -100):
+        label_pad_token_id (`int`, *optional*, defaults to -100):
             The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
-        return_tensors (:obj:`str`):
+        return_tensors (`str`):
             The type of Tensor to return. Allowable values are "np", "pt" and "tf".
     """
 
@@ -605,26 +605,27 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
     are not all of the same length.
 
     Args:
-        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
             The tokenizer used for encoding the data.
-        mlm (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to use masked language modeling. If set to :obj:`False`, the labels are the same as the
+        mlm (`bool`, *optional*, defaults to `True`):
+            Whether or not to use masked language modeling. If set to `False`, the labels are the same as the
             inputs with the padding tokens ignored (by setting them to -100). Otherwise, the labels are -100 for
             non-masked tokens and the value to predict for the masked token.
-        mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
-            The probability with which to (randomly) mask tokens in the input, when :obj:`mlm` is set to :obj:`True`.
-        pad_to_multiple_of (:obj:`int`, `optional`):
+        mlm_probability (`float`, *optional*, defaults to 0.15):
+            The probability with which to (randomly) mask tokens in the input, when `mlm` is set to `True`.
+        pad_to_multiple_of (`int`, *optional*):
             If set will pad the sequence to a multiple of the provided value.
-        return_tensors (:obj:`str`):
+        return_tensors (`str`):
             The type of Tensor to return. Allowable values are "np", "pt" and "tf".
 
-    .. note::
+    <Tip>
 
-        For best performance, this data collator should be used with a dataset having items that are dictionaries or
-        BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
-        :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
-        argument :obj:`return_special_tokens_mask=True`.
-    """
+    For best performance, this data collator should be used with a dataset having items that are dictionaries or
+    BatchEncoding, with the `"special_tokens_mask"` key, as returned by a
+    [`PreTrainedTokenizer`] or a [`PreTrainedTokenizerFast`] with the
+    argument `return_special_tokens_mask=True`.
+
+    </Tip>"""
 
     tokenizer: PreTrainedTokenizerBase
     mlm: bool = True
@@ -845,13 +846,14 @@ class DataCollatorForWholeWordMask(DataCollatorForLanguageModeling):
     - collates batches of tensors, honoring their tokenizer's pad_token
     - preprocesses batches for masked language modeling
 
-    .. note::
+    <Tip>
 
-        This collator relies on details of the implementation of subword tokenization by
-        :class:`~transformers.BertTokenizer`, specifically that subword tokens are prefixed with `##`. For tokenizers
-        that do not adhere to this scheme, this collator will produce an output that is roughly equivalent to
-        :class:`.DataCollatorForLanguageModeling`.
-    """
+    This collator relies on details of the implementation of subword tokenization by
+    [`BertTokenizer`], specifically that subword tokens are prefixed with *##*. For tokenizers
+    that do not adhere to this scheme, this collator will produce an output that is roughly equivalent to
+    [`DataCollatorForLanguageModeling`].
+
+    </Tip>"""
 
     def torch_call(self, examples: List[Union[List[int], Any, Dict[str, Any]]]) -> Dict[str, Any]:
         if isinstance(examples[0], (dict, BatchEncoding)):
@@ -1227,14 +1229,13 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
         """
         The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
 
-            0. Start from the beginning of the sequence by setting ``cur_len = 0`` (number of tokens processed so far).
-            1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be
+            0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
+            1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be
                masked)
-            2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be
+            2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be
                masked
-            3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length -
-               span_length]`` and mask tokens ``start_index:start_index + span_length``
-            4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in
+            3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
+            4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in
                the sequence to be processed), repeat from Step 1.
         """
         import torch
@@ -1325,14 +1326,13 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
         """
         The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
 
-            0. Start from the beginning of the sequence by setting ``cur_len = 0`` (number of tokens processed so far).
-            1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be
+            0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
+            1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be
                masked)
-            2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be
+            2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be
                masked
-            3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length -
-               span_length]`` and mask tokens ``start_index:start_index + span_length``
-            4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in
+            3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
+            4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in
                the sequence to be processed), repeat from Step 1.
         """
         from random import randint
@@ -1434,14 +1434,13 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
         """
         The masked tokens to be predicted for a particular sequence are determined by the following algorithm:
 
-            0. Start from the beginning of the sequence by setting ``cur_len = 0`` (number of tokens processed so far).
-            1. Sample a ``span_length`` from the interval ``[1, max_span_length]`` (length of span of tokens to be
+            0. Start from the beginning of the sequence by setting `cur_len = 0` (number of tokens processed so far).
+            1. Sample a `span_length` from the interval `[1, max_span_length]` (length of span of tokens to be
                masked)
-            2. Reserve a context of length ``context_length = span_length / plm_probability`` to surround span to be
+            2. Reserve a context of length `context_length = span_length / plm_probability` to surround span to be
                masked
-            3. Sample a starting point ``start_index`` from the interval ``[cur_len, cur_len + context_length -
-               span_length]`` and mask tokens ``start_index:start_index + span_length``
-            4. Set ``cur_len = cur_len + context_length``. If ``cur_len < max_len`` (i.e. there are tokens remaining in
+            3. Sample a starting point `start_index` from the interval `[cur_len, cur_len + context_length - span_length]` and mask tokens `start_index:start_index + span_length`
+            4. Set `cur_len = cur_len + context_length`. If `cur_len < max_len` (i.e. there are tokens remaining in
                the sequence to be processed), repeat from Step 1.
         """
         from random import randint
diff --git a/src/transformers/data/processors/glue.py b/src/transformers/data/processors/glue.py
index 3dc3e6544e..24ff39ddb3 100644
--- a/src/transformers/data/processors/glue.py
+++ b/src/transformers/data/processors/glue.py
@@ -48,20 +48,20 @@ def glue_convert_examples_to_features(
     output_mode=None,
 ):
     """
-    Loads a data file into a list of ``InputFeatures``
+    Loads a data file into a list of `InputFeatures`
 
     Args:
-        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
+        examples: List of `InputExamples` or `tf.data.Dataset` containing the examples.
         tokenizer: Instance of a tokenizer that will tokenize the examples
         max_length: Maximum example length. Defaults to the tokenizer's max_len
         task: GLUE task
-        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
-        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
+        label_list: List of labels. Can be obtained from the processor using the `processor.get_labels()` method
+        output_mode: String indicating the output mode. Either `regression` or `classification`
 
     Returns:
-        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the
-        task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific
-        ``InputFeatures`` which can be fed to the model.
+        If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
+        task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
+        `InputFeatures` which can be fed to the model.
 
     """
     warnings.warn(DEPRECATION_WARNING.format("function"), FutureWarning)
@@ -84,7 +84,7 @@ if is_tf_available():
     ) -> tf.data.Dataset:
         """
         Returns:
-            A ``tf.data.Dataset`` containing the task-specific features.
+            A `tf.data.Dataset` containing the task-specific features.
 
         """
         processor = glue_processors[task]()
diff --git a/src/transformers/data/processors/squad.py b/src/transformers/data/processors/squad.py
index cea84fb3b1..208ebe504f 100644
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -332,8 +332,8 @@ def squad_convert_examples_to_features(
     model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
 
     Args:
-        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
-        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
+        examples: list of [`~data.processors.squad.SquadExample`]
+        tokenizer: an instance of a child of [`PreTrainedTokenizer`]
         max_seq_length: The maximum sequence length of the inputs.
         doc_stride: The stride used when the context is too large and is split across several features.
         max_query_length: The maximum length of the query.
@@ -345,22 +345,23 @@ def squad_convert_examples_to_features(
 
 
     Returns:
-        list of :class:`~transformers.data.processors.squad.SquadFeatures`
+        list of [`~data.processors.squad.SquadFeatures`]
 
-    Example::
+    Example:
 
-        processor = SquadV2Processor()
-        examples = processor.get_dev_examples(data_dir)
+    ```python
+    processor = SquadV2Processor()
+    examples = processor.get_dev_examples(data_dir)
 
-        features = squad_convert_examples_to_features(
-            examples=examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=not evaluate,
-        )
-    """
+    features = squad_convert_examples_to_features(
+        examples=examples,
+        tokenizer=tokenizer,
+        max_seq_length=args.max_seq_length,
+        doc_stride=args.doc_stride,
+        max_query_length=args.max_query_length,
+        is_training=not evaluate,
+    )
+    ```"""
     # Defining helper methods
     features = []
 
@@ -574,23 +575,24 @@ class SquadProcessor(DataProcessor):
 
     def get_examples_from_dataset(self, dataset, evaluate=False):
         """
-        Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset.
+        Creates a list of [`~data.processors.squad.SquadExample`] using a TFDS dataset.
 
         Args:
-            dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
+            dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")`
             evaluate: Boolean specifying if in evaluation mode or in training mode
 
         Returns:
             List of SquadExample
 
-        Examples::
+        Examples:
 
-            >>> import tensorflow_datasets as tfds
-            >>> dataset = tfds.load("squad")
+        ```python
+        >>> import tensorflow_datasets as tfds
+        >>> dataset = tfds.load("squad")
 
-            >>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
-            >>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
-        """
+        >>> training_examples = get_examples_from_dataset(dataset, evaluate=False)
+        >>> evaluation_examples = get_examples_from_dataset(dataset, evaluate=True)
+        ```"""
 
         if evaluate:
             dataset = dataset["validation"]
@@ -759,8 +761,8 @@ class SquadExample:
 class SquadFeatures:
     """
     Single squad example features to be fed to a model. Those features are model-specific and can be crafted from
-    :class:`~transformers.data.processors.squad.SquadExample` using the
-    :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method.
+    [`~data.processors.squad.SquadExample`] using the
+    [`~data.processors.squad.squad_convert_examples_to_features`] method.
 
     Args:
         input_ids: Indices of input sequence tokens in the vocabulary.
diff --git a/src/transformers/data/processors/utils.py b/src/transformers/data/processors/utils.py
index e96376d01e..bb008fe153 100644
--- a/src/transformers/data/processors/utils.py
+++ b/src/transformers/data/processors/utils.py
@@ -60,7 +60,7 @@ class InputFeatures:
     Args:
         input_ids: Indices of input sequence tokens in the vocabulary.
         attention_mask: Mask to avoid performing attention on padding token indices.
-            Mask values selected in ``[0, 1]``: Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded)
+            Mask values selected in `[0, 1]`: Usually `1` for tokens that are NOT MASKED, `0` for MASKED (padded)
             tokens.
         token_type_ids: (Optional) Segment token indices to indicate first and second
             portions of the inputs. Only some models use them.
@@ -92,15 +92,15 @@ class DataProcessor:
         raise NotImplementedError()
 
     def get_train_examples(self, data_dir):
-        """Gets a collection of :class:`InputExample` for the train set."""
+        """Gets a collection of [`InputExample`] for the train set."""
         raise NotImplementedError()
 
     def get_dev_examples(self, data_dir):
-        """Gets a collection of :class:`InputExample` for the dev set."""
+        """Gets a collection of [`InputExample`] for the dev set."""
         raise NotImplementedError()
 
     def get_test_examples(self, data_dir):
-        """Gets a collection of :class:`InputExample` for the test set."""
+        """Gets a collection of [`InputExample`] for the test set."""
         raise NotImplementedError()
 
     def get_labels(self):
@@ -240,21 +240,21 @@ class SingleSentenceClassificationProcessor(DataProcessor):
         return_tensors=None,
     ):
         """
-        Convert examples in a list of ``InputFeatures``
+        Convert examples in a list of `InputFeatures`
 
         Args:
             tokenizer: Instance of a tokenizer that will tokenize the examples
             max_length: Maximum example length
-            pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
+            pad_on_left: If set to `True`, the examples will be padded on the left rather than on the right (default)
             pad_token: Padding token
-            mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
-                and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
+            mask_padding_with_zero: If set to `True`, the attention mask will be filled by `1` for actual values
+                and by `0` for padded values. If set to `False`, inverts it (`1` for padded values, `0` for
                 actual values)
 
         Returns:
-            If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` containing the
-            task-specific features. If the input is a list of ``InputExamples``, will return a list of task-specific
-            ``InputFeatures`` which can be fed to the model.
+            If the `examples` input is a `tf.data.Dataset`, will return a `tf.data.Dataset` containing the
+            task-specific features. If the input is a list of `InputExamples`, will return a list of task-specific
+            `InputFeatures` which can be fed to the model.
 
         """
         if max_length is None:
diff --git a/src/transformers/debug_utils.py b/src/transformers/debug_utils.py
index 4588ca58f5..d876c40527 100644
--- a/src/transformers/debug_utils.py
+++ b/src/transformers/debug_utils.py
@@ -28,7 +28,7 @@ logger = logging.get_logger(__name__)
 class DebugUnderflowOverflow:
     """
     This debug class helps detect and understand where the model starts getting very large or very small, and more
-    importantly ``nan`` or ``inf`` weight and activation elements.
+    importantly `nan` or `inf` weight and activation elements.
 
     There are 2 working modes:
 
@@ -37,69 +37,77 @@ class DebugUnderflowOverflow:
 
     Mode 1: Underflow/overflow detection
 
-    To activate the underflow/overflow detection, initialize the object with the model ::
+    To activate the underflow/overflow detection, initialize the object with the model:
 
-        debug_overflow = DebugUnderflowOverflow(model)
+    ```python
+    debug_overflow = DebugUnderflowOverflow(model)
+    ```
 
-    then run the training as normal and if ``nan`` or ``inf`` gets detected in at least one of the weight, input or
-    output elements this module will throw an exception and will print ``max_frames_to_save`` frames that lead to this
+    then run the training as normal and if `nan` or `inf` gets detected in at least one of the weight, input or
+    output elements this module will throw an exception and will print `max_frames_to_save` frames that lead to this
     event, each frame reporting
 
-    1. the fully qualified module name plus the class name whose ``forward`` was run
+    1. the fully qualified module name plus the class name whose `forward` was run
     2. the absolute min and max value of all elements for each module weights, and the inputs and output
 
-    For example, here is the header and the last few frames in detection report for ``google/mt5-small`` run in fp16 mixed precision ::
+    For example, here is the header and the last few frames in detection report for `google/mt5-small` run in fp16 mixed precision:
 
-        Detected inf/nan during batch_number=0
-        Last 21 forward frames:
-        abs min  abs max  metadata
-        [...]
-                          encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
-        2.17e-07 4.50e+00 weight
-        1.79e-06 4.65e+00 input[0]
-        2.68e-06 3.70e+01 output
-                          encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
-        8.08e-07 2.66e+01 weight
-        1.79e-06 4.65e+00 input[0]
-        1.27e-04 2.37e+02 output
-                          encoder.block.2.layer.1.DenseReluDense.wo Linear
-        1.01e-06 6.44e+00 weight
-        0.00e+00 9.74e+03 input[0]
-        3.18e-04 6.27e+04 output
-                          encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
-        1.79e-06 4.65e+00 input[0]
-        3.18e-04 6.27e+04 output
-                          encoder.block.2.layer.1.dropout Dropout
-        3.18e-04 6.27e+04 input[0]
-        0.00e+00      inf output
+    ```
+    Detected inf/nan during batch_number=0
+    Last 21 forward frames:
+    abs min  abs max  metadata
+    [...]
+                      encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
+    2.17e-07 4.50e+00 weight
+    1.79e-06 4.65e+00 input[0]
+    2.68e-06 3.70e+01 output
+                      encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
+    8.08e-07 2.66e+01 weight
+    1.79e-06 4.65e+00 input[0]
+    1.27e-04 2.37e+02 output
+                      encoder.block.2.layer.1.DenseReluDense.wo Linear
+    1.01e-06 6.44e+00 weight
+    0.00e+00 9.74e+03 input[0]
+    3.18e-04 6.27e+04 output
+                      encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
+    1.79e-06 4.65e+00 input[0]
+    3.18e-04 6.27e+04 output
+                      encoder.block.2.layer.1.dropout Dropout
+    3.18e-04 6.27e+04 input[0]
+    0.00e+00      inf output
+    ```
 
-    You can see here, that ``T5DenseGatedGeluDense.forward`` resulted in output activations, whose absolute max value
-    was around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have ``Dropout`` which
+    You can see here, that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value
+    was around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout` which
     renormalizes the weights, after it zeroed some of the elements, which pushes the absolute max value to more than
     64K, and we get an overlow.
 
     As you can see it's the previous frames that we need to look into when the numbers start going into very large for
     fp16 numbers.
 
-    The tracking is done in a forward hook, which gets invoked immediately after ``forward`` has completed.
+    The tracking is done in a forward hook, which gets invoked immediately after `forward` has completed.
 
-    By default the last 21 frames are printed. You can change the default to adjust for your needs. For example ::
+    By default the last 21 frames are printed. You can change the default to adjust for your needs. For example:
 
-        debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
+    ```python
+    debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
+    ```
 
-    To validate that you have set up this debugging feature correctly, and you intend to use it in a training that may
-    take hours to complete, first run it with normal tracing enabled for one of a few batches as explained in the next
-    section.
+    To validate that you have set up this debugging feature correctly, and you intend to use it in a training that may
+    take hours to complete, first run it with normal tracing enabled for one of a few batches as explained in the next
+    section.
 
 
-    Mode 2. Specific batch absolute min/max tracing without detection
+    Mode 2. Specific batch absolute min/max tracing without detection
 
-    The second work mode is per-batch tracing with the underflow/overflow detection feature turned off.
+    The second work mode is per-batch tracing with the underflow/overflow detection feature turned off.
 
-    Let's say you want to watch the absolute min and max values for all the ingredients of each ``forward`` call of a
-    given batch, and only do that for batches 1 and 3. Then you instantiate this class as ::
+    Let's say you want to watch the absolute min and max values for all the ingredients of each `forward` call of a
+    given batch, and only do that for batches 1 and 3. Then you instantiate this class as:
 
-        debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
+    ```python
+    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
+    ```
 
     And now full batches 1 and 3 will be traced using the same format as explained above. Batches are 0-indexed.
 
@@ -109,28 +117,29 @@ class DebugUnderflowOverflow:
 
     Early stopping:
 
-    You can also specify the batch number after which to stop the training, with ::
+    You can also specify the batch number after which to stop the training, with:
 
-        debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)
+    ```python
+    debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)
+    ```
 
     This feature is mainly useful in the tracing mode, but you can use it for any mode.
 
 
     **Performance**:
 
-    As this module measures absolute ``min``/``max`` of each weight of the model on every forward it'll slow the
+    As this module measures absolute `min`/`max` of each weight of the model on every forward it'll slow the
     training down. Therefore remember to turn it off once the debugging needs have been met.
 
     Args:
-        model (:obj:`nn.Module`):
+        model (`nn.Module`):
             The model to debug.
-        max_frames_to_save (:obj:`int`, `optional`, defaults to 21):
+        max_frames_to_save (`int`, *optional*, defaults to 21):
             How many frames back to record
-        trace_batch_nums(:obj:`List[int]`, `optional`, defaults to ``[]``):
+        trace_batch_nums(`List[int]`, *optional*, defaults to `[]`):
             Which batch numbers to trace (turns detection off)
-        abort_after_batch_num  (:obj:`int`, `optional`):
+        abort_after_batch_num (`int`, *optional*):
             Whether to abort after a certain batch number has finished
-
     """
 
     def __init__(self, model, max_frames_to_save=21, trace_batch_nums=[], abort_after_batch_num=None):
@@ -287,7 +296,7 @@ def get_abs_min_max(var, ctx):
 
 def detect_overflow(var, ctx):
     """
-    Report whether the tensor contains any ``nan`` or ``inf`` entries.
+    Report whether the tensor contains any `nan` or `inf` entries.
 
     This is useful for detecting overflows/underflows and best to call right after the function that did some math that
     modified the tensor in question.
@@ -300,7 +309,7 @@ def detect_overflow(var, ctx):
         ctx: the message to print as a context
 
     Return:
-        :obj:`True` if ``inf`` or ``nan`` was detected, :obj:`False` otherwise
+        `True` if `inf` or `nan` was detected, `False` otherwise
     """
     detected = False
     if torch.isnan(var).any().item():
diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py
index d1d2114c89..0680be888d 100644
--- a/src/transformers/deepspeed.py
+++ b/src/transformers/deepspeed.py
@@ -41,16 +41,16 @@ class HfDeepSpeedConfig:
     """
     This object contains a DeepSpeed configuration dictionary and can be quickly queried for things like zero stage.
 
-    A ``weakref`` of this object is stored in the module's globals to be able to access the config from areas where
-    things like the Trainer object is not available (e.g. ``from_pretrained`` and ``_get_resized_embeddings``).
+    A `weakref` of this object is stored in the module's globals to be able to access the config from areas where
+    things like the Trainer object is not available (e.g. `from_pretrained` and `_get_resized_embeddings`).
     Therefore it's important that this object remains alive while the program is still running.
 
-    :class:`~transformers.Trainer` uses the ``HfTrainerDeepSpeedConfig`` subclass instead. That subclass has logic to
-    sync the configuration with values of :class:`~transformers.TrainingArguments` by replacing special placeholder
-    values: ``"auto"``. Without this special logic the DeepSpeed configuration is not modified in any way.
+    [`Trainer`] uses the `HfTrainerDeepSpeedConfig` subclass instead. That subclass has logic to
+    sync the configuration with values of [`TrainingArguments`] by replacing special placeholder
+    values: `"auto"`. Without this special logic the DeepSpeed configuration is not modified in any way.
 
     Args:
-        config_file_or_dict (:obj:`Union[str, Dict]`): path to DeepSpeed config file or dict.
+        config_file_or_dict (`Union[str, Dict]`): path to DeepSpeed config file or dict.
 
     """
 
@@ -104,7 +104,7 @@ class HfDeepSpeedConfig:
 
     def get_value(self, ds_key_long, default=None):
         """
-        Returns the set value or ``default`` if no value is set
+        Returns the set value or `default` if no value is set
         """
         config, ds_key = self.find_config_node(ds_key_long)
         if config is None:
@@ -115,7 +115,7 @@ class HfDeepSpeedConfig:
         """
         Deletes a sub-section of the config file if it's found.
 
-        Unless ``must_exist`` is :obj:`True` the section doesn't have to exist.
+        Unless `must_exist` is `True` the section doesn't have to exist.
         """
         config = self.config
 
@@ -136,8 +136,7 @@ class HfDeepSpeedConfig:
 
     def is_true(self, ds_key_long):
         """
-        Returns :obj:`True`/:obj:`False` only if the value is set, always :obj:`False` otherwise. So use this method to
-        ask the very specific question of whether the value is set to :obj:`True` (and it's not set to :obj:`False` or
+        Returns `True`/`False` only if the value is set, always `False` otherwise. So use this method to ask the very specific question of whether the value is set to `True` (and it's not set to `False` or
         isn't set).
 
         """
@@ -146,8 +145,7 @@ class HfDeepSpeedConfig:
 
     def is_false(self, ds_key_long):
         """
-        Returns :obj:`True`/:obj:`False` only if the value is set, always :obj:`False` otherwise. So use this method to
-        ask the very specific question of whether the value is set to :obj:`False` (and it's not set to :obj:`True` or
+        Returns `True`/`False` only if the value is set, always `False` otherwise. So use this method to ask the very specific question of whether the value is set to `False` (and it's not set to `True` or
         isn't set).
         """
         value = self.get_value(ds_key_long)
@@ -165,7 +163,7 @@ class HfDeepSpeedConfig:
 
 class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
     """
-    The ``HfTrainerDeepSpeedConfig`` object is meant to be created during ``TrainingArguments`` object creation and has
+    The `HfTrainerDeepSpeedConfig` object is meant to be created during `TrainingArguments` object creation and has
     the same lifespan as the latter.
     """
 
@@ -181,11 +179,11 @@ class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
         """
         A utility method that massages the config file and can optionally verify that the values match.
 
-        1. Replace "auto" values with ``TrainingArguments`` value.
+        1. Replace "auto" values with `TrainingArguments` value.
 
-        2. If it wasn't "auto" and ``must_match`` is true, then check that DS config matches Trainer
-        config values and if mismatched add the entry to ``self.mismatched`` - will assert during
-        ``trainer_config_finalize`` for one or more mismatches.
+        2. If it wasn't "auto" and `must_match` is true, then check that DS config matches Trainer
+        config values and if mismatched add the entry to `self.mismatched` - will assert during
+        `trainer_config_finalize` for one or more mismatches.
 
         """
         config, ds_key = self.find_config_node(ds_key_long)
@@ -207,7 +205,7 @@ class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
 
     def trainer_config_process(self, args):
         """
-        Adjust the config with ``TrainingArguments`` values. This stage is run during ``TrainingArguments`` object
+        Adjust the config with `TrainingArguments` values. This stage is run during `TrainingArguments` object
         creation.
         """
         # DeepSpeed does:
@@ -373,7 +371,7 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None, inf
     """
     Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args.
 
-    If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made.
+    If `resume_from_checkpoint` was passed then an attempt to resume from a previously saved checkpoint will be made.
 
     Args:
         trainer: Trainer object
diff --git a/src/transformers/feature_extraction_sequence_utils.py b/src/transformers/feature_extraction_sequence_utils.py
index 5d8304b062..fbf2fc37ec 100644
--- a/src/transformers/feature_extraction_sequence_utils.py
+++ b/src/transformers/feature_extraction_sequence_utils.py
@@ -40,11 +40,11 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
     This is a general feature extraction class for speech recognition.
 
     Args:
-        feature_size (:obj:`int`):
+        feature_size (`int`):
             The feature dimension of the extracted features.
-        sampling_rate (:obj:`int`):
+        sampling_rate (`int`):
             The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz).
-        padding_value (:obj:`float`):
+        padding_value (`float`):
             The value that is used to fill the padding values / vectors.
     """
 
@@ -79,53 +79,54 @@ class SequenceFeatureExtractor(FeatureExtractionMixin):
         max sequence length in the batch.
 
         Padding side (left/right) padding values are defined at the feature extractor level (with
-        ``self.padding_side``, ``self.padding_value``)
+        `self.padding_side`, `self.padding_value`)
 
-        .. note::
+        <Tip>
 
-            If the ``processed_features`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors,
-            the result will use the same type unless you provide a different tensor type with ``return_tensors``. In
-            the case of PyTorch tensors, you will lose the specific device of your tensors however.
+        If the `processed_features` passed are a dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors,
+        the result will use the same type unless you provide a different tensor type with `return_tensors`. In
+        the case of PyTorch tensors, you will lose the specific device of your tensors however.
+
+        </Tip>
 
         Args:
-            processed_features (:class:`~transformers.BatchFeature`, list of :class:`~transformers.BatchFeature`, :obj:`Dict[str, List[float]]`, :obj:`Dict[str, List[List[float]]` or :obj:`List[Dict[str, List[float]]]`):
-                Processed inputs. Can represent one input (:class:`~transformers.BatchFeature` or :obj:`Dict[str,
-                List[float]]`) or a batch of input values / vectors (list of :class:`~transformers.BatchFeature`,
-                `Dict[str, List[List[float]]]` or `List[Dict[str, List[float]]]`) so you can use this method during
+            processed_features ([`BatchFeature`], list of [`BatchFeature`], `Dict[str, List[float]]`, `Dict[str, List[List[float]]]` or `List[Dict[str, List[float]]]`):
+                Processed inputs. Can represent one input ([`BatchFeature`] or `Dict[str, List[float]]`) or a batch of input values / vectors (list of [`BatchFeature`],
+                `Dict[str, List[List[float]]]` or `List[Dict[str, List[float]]]`) so you can use this method during
                 preprocessing as well as in a PyTorch Dataloader collate function.
 
-                Instead of :obj:`List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow
+                Instead of `List[float]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow
                 tensors), see the note above for the return type.
-            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
                 index) among:
 
-                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                   single sequence if provided).
-                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                   maximum acceptable input length for the model if that argument is not provided.
-                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                   different lengths).
-            max_length (:obj:`int`, `optional`):
+            max_length (`int`, *optional*):
                 Maximum length of the returned list and optionally padding length (see above).
-            truncation (:obj:`bool`):
-                Activates truncation to cut input sequences longer than :obj:`max_length` to :obj:`max_length`.
-            pad_to_multiple_of (:obj:`int`, `optional`):
+            truncation (`bool`):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value.
 
                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                 >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
-            return_attention_mask (:obj:`bool`, `optional`):
+            return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
                 to the specific feature_extractor's default.
 
-                `What are attention masks? <../glossary.html#attention-mask>`__
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+                [What are attention masks?](../glossary#attention-mask)
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
         """
         # If we have a list of dicts, let's convert it in a dict of lists
         # We do this to allow using this method as a collate_fn function in PyTorch Dataloader
diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py
index fd616b59cf..e96c8e23cd 100644
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@@ -54,16 +54,16 @@ PreTrainedFeatureExtractor = Union["SequenceFeatureExtractor"]  # noqa: F821
 
 class BatchFeature(UserDict):
     r"""
-    Holds the output of the :meth:`~transformers.SequenceFeatureExtractor.pad` and feature extractor specific
-    ``__call__`` methods.
+    Holds the output of the [`~SequenceFeatureExtractor.pad`] and feature extractor specific
+    `__call__` methods.
 
     This class is derived from a python dictionary and can be used as a dictionary.
 
     Args:
-        data (:obj:`dict`):
+        data (`dict`):
             Dictionary of lists/arrays/tensors returned by the __call__/pad methods ('input_values', 'attention_mask',
             etc.).
-        tensor_type (:obj:`Union[None, str, TensorType]`, `optional`):
+        tensor_type (`Union[None, str, TensorType]`, *optional*):
             You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
             initialization.
     """
@@ -74,7 +74,7 @@ class BatchFeature(UserDict):
 
     def __getitem__(self, item: str) -> Union[Any]:
         """
-        If the key is a string, returns the value of the dict associated to :obj:`key` ('input_values',
+        If the key is a string, returns the value of the dict associated to `key` ('input_values',
         'attention_mask', etc.).
         """
         if isinstance(item, str):
@@ -112,9 +112,9 @@ class BatchFeature(UserDict):
         Convert the inner content to tensors.
 
         Args:
-            tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
-                The type of tensors to use. If :obj:`str`, should be one of the values of the enum
-                :class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done.
+            tensor_type (`str` or [`~file_utils.TensorType`], *optional*):
+                The type of tensors to use. If `str`, should be one of the values of the enum
+                [`~file_utils.TensorType`]. If `None`, no modification is done.
         """
         if tensor_type is None:
             return self
@@ -176,13 +176,13 @@ class BatchFeature(UserDict):
     # Copied from transformers.tokenization_utils_base.BatchEncoding.to with BatchEncoding->BatchFeature
     def to(self, device: Union[str, "torch.device"]) -> "BatchFeature":
         """
-        Send all values to device by calling :obj:`v.to(device)` (PyTorch only).
+        Send all values to device by calling `v.to(device)` (PyTorch only).
 
         Args:
-            device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on.
+            device (`str` or `torch.device`): The device to put the tensors on.
 
         Returns:
-            :class:`~transformers.BatchFeature`: The same instance after modification.
+            [`BatchFeature`]: The same instance after modification.
         """
 
         # This check catches things like APEX blindly calling "to" on all inputs to a module
@@ -216,83 +216,84 @@ class FeatureExtractionMixin:
         cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
     ) -> PreTrainedFeatureExtractor:
         r"""
-        Instantiate a type of :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin` from a feature
-        extractor, *e.g.* a derived class of :class:`~transformers.SequenceFeatureExtractor`.
+        Instantiate a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a feature
+        extractor, *e.g.* a derived class of [`SequenceFeatureExtractor`].
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 This can be either:
 
-                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a feature extractor file saved using the
-                  :func:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` method, e.g.,
-                  ``./my_model_directory/``.
-                - a path or url to a saved feature extractor JSON `file`, e.g.,
-                  ``./my_model_directory/preprocessor_config.json``.
-            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g.,
+                  `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
+            cache_dir (`str` or `os.PathLike`, *optional*):
                 Path to a directory in which a downloaded pretrained model feature extractor should be cached if the
                 standard cache should not be used.
-            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force to (re-)download the feature extractor files and override the cached versions
                 if they exist.
-            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            resume_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to delete incompletely received file. Attempts to resume the download if such a file
                 exists.
-            proxies (:obj:`Dict[str, str]`, `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
-            use_auth_token (:obj:`str` or `bool`, `optional`):
-                The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
-                generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
-            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            use_auth_token (`str` or `bool`, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+                generated when running `transformers-cli login` (stored in `~/.huggingface`).
+            revision (`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                 identifier allowed by git.
-            return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                If :obj:`False`, then this function returns just the final feature extractor object. If :obj:`True`,
-                then this functions returns a :obj:`Tuple(feature_extractor, unused_kwargs)` where `unused_kwargs` is a
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                If `False`, then this function returns just the final feature extractor object. If `True`,
+                then this function returns a `Tuple(feature_extractor, unused_kwargs)` where `unused_kwargs` is a
                 dictionary consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the
-                part of ``kwargs`` which has not been used to update ``feature_extractor`` and is otherwise ignored.
-            kwargs (:obj:`Dict[str, Any]`, `optional`):
+                part of `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
+            kwargs (`Dict[str, Any]`, *optional*):
                 The values in kwargs of any keys which are feature extractor attributes will be used to override the
                 loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
-                controlled by the ``return_unused_kwargs`` keyword parameter.
+                controlled by the `return_unused_kwargs` keyword parameter.
 
-        .. note::
+        <Tip>
 
-            Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+        Passing `use_auth_token=True` is required when you want to use a private model.
 
+        </Tip>
 
         Returns:
-            A feature extractor of type :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin`.
+            A feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`].
 
-        Examples::
+        Examples:
 
-            # We can't instantiate directly the base class `FeatureExtractionMixin` nor `SequenceFeatureExtractor` so let's show the examples on a
-            # derived class: `Wav2Vec2FeatureExtractor`
-            feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h')    # Download feature_extraction_config from huggingface.co and cache.
-            feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/')  # E.g. feature_extractor (or model) was saved using `save_pretrained('./test/saved_model/')`
-            feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/preprocessor_config.json')
-            feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False, foo=False)
-            assert feature_extractor.return_attention_mask is False
-            feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False,
-                                                               foo=False, return_unused_kwargs=True)
-            assert feature_extractor.return_attention_mask is False
-            assert unused_kwargs == {'foo': False}
-        """
+        ```python
+        # We can't instantiate directly the base class `FeatureExtractionMixin` nor `SequenceFeatureExtractor` so let's show the examples on a
+        # derived class: `Wav2Vec2FeatureExtractor`
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h')    # Download feature_extraction_config from huggingface.co and cache.
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/')  # E.g. feature_extractor (or model) was saved using `save_pretrained('./test/saved_model/')`
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('./test/saved_model/preprocessor_config.json')
+        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False, foo=False)
+        assert feature_extractor.return_attention_mask is False
+        feature_extractor, unused_kwargs = Wav2Vec2FeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h', return_attention_mask=False,
+                                                           foo=False, return_unused_kwargs=True)
+        assert feature_extractor.return_attention_mask is False
+        assert unused_kwargs == {'foo': False}
+        ```"""
         feature_extractor_dict, kwargs = cls.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs)
 
         return cls.from_dict(feature_extractor_dict, **kwargs)
 
     def save_pretrained(self, save_directory: Union[str, os.PathLike]):
         """
-        Save a feature_extractor object to the directory ``save_directory``, so that it can be re-loaded using the
-        :func:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained` class method.
+        Save a feature_extractor object to the directory `save_directory`, so that it can be re-loaded using the
+        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] class method.
 
         Args:
-            save_directory (:obj:`str` or :obj:`os.PathLike`):
+            save_directory (`str` or `os.PathLike`):
                 Directory where the feature extractor JSON file will be saved (will be created if it does not exist).
         """
         if os.path.isfile(save_directory):
@@ -309,16 +310,16 @@ class FeatureExtractionMixin:
         cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
     ) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         """
-        From a ``pretrained_model_name_or_path``, resolve to a dictionary of parameters, to be used for instantiating a
-        feature extractor of type :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin` using
-        ``from_dict``.
+        From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a
+        feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`] using
+        `from_dict`.
 
         Parameters:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
 
         Returns:
-            :obj:`Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor
+            `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the feature extractor
             object.
         """
         cache_dir = kwargs.pop("cache_dir", None)
@@ -397,19 +398,19 @@ class FeatureExtractionMixin:
     @classmethod
     def from_dict(cls, feature_extractor_dict: Dict[str, Any], **kwargs) -> PreTrainedFeatureExtractor:
         """
-        Instantiates a type of :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin` from a Python
+        Instantiates a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a Python
         dictionary of parameters.
 
         Args:
-            feature_extractor_dict (:obj:`Dict[str, Any]`):
+            feature_extractor_dict (`Dict[str, Any]`):
                 Dictionary that will be used to instantiate the feature extractor object. Such a dictionary can be
                 retrieved from a pretrained checkpoint by leveraging the
-                :func:`~transformers.feature_extraction_utils.FeatureExtractionMixin.to_dict` method.
-            kwargs (:obj:`Dict[str, Any]`):
+                [`~feature_extraction_utils.FeatureExtractionMixin.to_dict`] method.
+            kwargs (`Dict[str, Any]`):
                 Additional parameters from which to initialize the feature extractor object.
 
         Returns:
-            :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin`: The feature extractor object
+            [`~feature_extraction_utils.FeatureExtractionMixin`]: The feature extractor object
             instantiated from those parameters.
         """
         return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
@@ -436,7 +437,7 @@ class FeatureExtractionMixin:
         Serializes this instance to a Python dictionary.
 
         Returns:
-            :obj:`Dict[str, Any]`: Dictionary of all the attributes that make up this feature extractor instance.
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this feature extractor instance.
         """
         output = copy.deepcopy(self.__dict__)
         output["feature_extractor_type"] = self.__class__.__name__
@@ -446,15 +447,15 @@ class FeatureExtractionMixin:
     @classmethod
     def from_json_file(cls, json_file: Union[str, os.PathLike]) -> PreTrainedFeatureExtractor:
         """
-        Instantiates a feature extractor of type :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin`
+        Instantiates a feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`]
         from the path to a JSON file of parameters.
 
         Args:
-            json_file (:obj:`str` or :obj:`os.PathLike`):
+            json_file (`str` or `os.PathLike`):
                 Path to the JSON file containing the parameters.
 
         Returns:
-            A feature extractor of type :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin`: The
+            A feature extractor of type [`~feature_extraction_utils.FeatureExtractionMixin`]: The
             feature_extractor object instantiated from that JSON file.
         """
         with open(json_file, "r", encoding="utf-8") as reader:
@@ -467,7 +468,7 @@ class FeatureExtractionMixin:
         Serializes this instance to a JSON string.
 
         Returns:
-            :obj:`str`: String containing all the attributes that make up this feature_extractor instance in JSON
+            `str`: String containing all the attributes that make up this feature_extractor instance in JSON
             format.
         """
         dictionary = self.to_dict()
@@ -483,7 +484,7 @@ class FeatureExtractionMixin:
         Save this instance to a JSON file.
 
         Args:
-            json_file_path (:obj:`str` or :obj:`os.PathLike`):
+            json_file_path (`str` or `os.PathLike`):
                 Path to the JSON file in which this feature_extractor instance's parameters will be saved.
         """
         with open(json_file_path, "w", encoding="utf-8") as writer:
diff --git a/src/transformers/generation_beam_search.py b/src/transformers/generation_beam_search.py
index aa20350b9a..663e8c31f0 100644
--- a/src/transformers/generation_beam_search.py
+++ b/src/transformers/generation_beam_search.py
@@ -25,70 +25,70 @@ from .file_utils import add_start_docstrings
 
 PROCESS_INPUTS_DOCSTRING = r"""
     Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`):
+        input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using any class inheriting from :class:`~transformers.PreTrainedTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            Indices can be obtained using any class inheriting from [`PreTrainedTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
             details.
 
-            `What are input IDs? <../glossary.html#input-ids>`__
-        next_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2 * num_beams)`):
-            Current scores of the top :obj:`2 * num_beams` non-finished beam hypotheses.
-        next_tokens (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`):
-            :obj:`input_ids` of the tokens corresponding to the top :obj:`2 * num_beams` non-finished beam hypotheses.
-        next_indices (:obj:`torch.LongTensor` of shape :obj:`(batch_size, 2 * num_beams)`):
-            Beam indices indicating to which beam hypothesis the :obj:`next_tokens` correspond.
-        pad_token_id (:obj:`int`, `optional`):
-            The id of the `padding` token.
-        eos_token_id (:obj:`int`, `optional`):
-            The id of the `end-of-sequence` token.
+            [What are input IDs?](../glossary#input-ids)
+        next_scores (`torch.FloatTensor` of shape `(batch_size, 2 * num_beams)`):
+            Current scores of the top `2 * num_beams` non-finished beam hypotheses.
+        next_tokens (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
+            `input_ids` of the tokens corresponding to the top `2 * num_beams` non-finished beam hypotheses.
+        next_indices (`torch.LongTensor` of shape `(batch_size, 2 * num_beams)`):
+            Beam indices indicating to which beam hypothesis the `next_tokens` correspond.
+        pad_token_id (`int`, *optional*):
+            The id of the *padding* token.
+        eos_token_id (`int`, *optional*):
+            The id of the *end-of-sequence* token.
 
     Return:
-        :obj:`UserDict`: A dictionary composed of the fields as defined above:
+        `UserDict`: A dictionary composed of the fields as defined above:
 
-            - **next_beam_scores** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Updated
+            - **next_beam_scores** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Updated
               scores of all non-finished beams.
-            - **next_beam_tokens** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Next tokens
+            - **next_beam_tokens** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Next tokens
               to be added to the non-finished beam_hypotheses.
-            - **next_beam_indices** (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`) -- Beam indices
+            - **next_beam_indices** (`torch.FloatTensor` of shape `(batch_size * num_beams)`) -- Beam indices
               indicating to which beam the next tokens shall be added.
 
 """
 
 FINALIZE_INPUTS_DOCSTRING = r"""
     Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size * num_beams, sequence_length)`):
+        input_ids (`torch.LongTensor` of shape `(batch_size * num_beams, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using any class inheriting from :class:`~transformers.PreTrainedTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            Indices can be obtained using any class inheriting from [`PreTrainedTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
             details.
 
-            `What are input IDs? <../glossary.html#input-ids>`__
-        final_beam_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`):
+            [What are input IDs?](../glossary#input-ids)
+        final_beam_scores (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
             The final scores of all non-finished beams.
-        final_beam_tokens (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`):
+        final_beam_tokens (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
             The last tokens to be added to the non-finished beam_hypotheses.
-        final_beam_indices (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_beams)`):
-            The beam indices indicating to which beam the :obj:`final_beam_tokens` shall be added.
-        pad_token_id (:obj:`int`, `optional`):
-            The id of the `padding` token.
-        eos_token_id (:obj:`int`, `optional`):
-            The id of the `end-of-sequence` token.
+        final_beam_indices (`torch.FloatTensor` of shape `(batch_size * num_beams)`):
+            The beam indices indicating to which beam the `final_beam_tokens` shall be added.
+        pad_token_id (`int`, *optional*):
+            The id of the *padding* token.
+        eos_token_id (`int`, *optional*):
+            The id of the *end-of-sequence* token.
 
     Return:
-        :obj:`torch.LongTensor` of shape :obj:`(batch_size * num_return_sequences, sequence_length)`: The generated
-        sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or shorter if all
-        batches finished early due to the :obj:`eos_token_id`.
+        `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`: The generated
+        sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter if all
+        batches finished early due to the `eos_token_id`.
 
 """
 
 
 class BeamScorer(ABC):
     """
-    Abstract base class for all beam scorers that are used for :meth:`~transformers.PreTrainedModel.beam_search` and
-    :meth:`~transformers.PreTrainedModel.beam_sample`.
+    Abstract base class for all beam scorers that are used for [`~PreTrainedModel.beam_search`] and
+    [`~PreTrainedModel.beam_sample`].
     """
 
     @abstractmethod
@@ -119,36 +119,34 @@ class BeamScorer(ABC):
 
 class BeamSearchScorer(BeamScorer):
     r"""
-    :class:`transformers.BeamScorer` implementing standard beam search decoding.
+    [`BeamScorer`] implementing standard beam search decoding.
 
-    Adapted in part from `Facebook's XLM beam search code
-    <https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529>`__.
+    Adapted in part from [Facebook's XLM beam search code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529).
 
-    Reference for the diverse beam search algorithm and implementation `Ashwin Kalyan's DBS implementation
-    <https://github.com/ashwinkalyan/dbs/blob/master/dbs/beam_utils.lua>`__
+    Reference for the diverse beam search algorithm and implementation [Ashwin Kalyan's DBS implementation](https://github.com/ashwinkalyan/dbs/blob/master/dbs/beam_utils.lua)
 
     Args:
-        batch_size (:obj:`int`):
-            Batch Size of :obj:`input_ids` for which standard beam search decoding is run in parallel.
-        max_length (:obj:`int`):
+        batch_size (`int`):
+            Batch Size of `input_ids` for which standard beam search decoding is run in parallel.
+        max_length (`int`):
             The maximum length of the sequence to be generated.
-        num_beams (:obj:`int`):
+        num_beams (`int`):
             Number of beams for beam search.
-        device (:obj:`torch.device`):
-            Defines the device type (*e.g.*, :obj:`"cpu"` or :obj:`"cuda"`) on which this instance of
-            :obj:`BeamSearchScorer` will be allocated.
-        length_penalty (:obj:`float`, `optional`, defaults to 1.0):
+        device (`torch.device`):
+            Defines the device type (*e.g.*, `"cpu"` or `"cuda"`) on which this instance of
+            `BeamSearchScorer` will be allocated.
+        length_penalty (`float`, *optional*, defaults to 1.0):
             Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the
             model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer
             sequences.
-        do_early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not.
-        num_beam_hyps_to_keep (:obj:`int`, `optional`, defaults to 1):
+        do_early_stopping (`bool`, *optional*, defaults to `False`):
+            Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
+        num_beam_hyps_to_keep (`int`, *optional*, defaults to 1):
             The number of beam hypotheses that shall be returned upon calling
-            :meth:`~transformer.BeamSearchScorer.finalize`.
-        num_beam_groups (:obj:`int`):
-            Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of
-            beams. See `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details.
+            [`~BeamSearchScorer.finalize`].
+        num_beam_groups (`int`):
+            Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
+            beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
     """
 
     def __init__(
diff --git a/src/transformers/generation_flax_logits_process.py b/src/transformers/generation_flax_logits_process.py
index 1244291775..1b6bd4df6c 100644
--- a/src/transformers/generation_flax_logits_process.py
+++ b/src/transformers/generation_flax_logits_process.py
@@ -29,22 +29,22 @@ logger = get_logger(__name__)
 
 LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
     Args:
-        input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+        input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            Indices can be obtained using [`PreTrainedTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
             details.
 
-            `What are input IDs? <../glossary.html#input-ids>`__
-        scores (:obj:`jnp.ndarray` of shape :obj:`(batch_size, config.vocab_size)`):
+            [What are input IDs?](../glossary#input-ids)
+        scores (`jnp.ndarray` of shape `(batch_size, config.vocab_size)`):
             Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
             search or log softmax for each vocabulary token when using beam search
         kwargs:
             Additional logits processor specific kwargs.
 
     Return:
-        :obj:`jnp.ndarray` of shape :obj:`(batch_size, config.vocab_size)`: The processed prediction scores.
+        `jnp.ndarray` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
 
 """
 
@@ -73,10 +73,10 @@ class FlaxLogitsWarper(ABC):
 
 class FlaxLogitsProcessorList(list):
     """
-    This class can be used to create a list of :class:`~transformers.FlaxLogitsProcessor` or
-    :class:`~transformers.FlaxLogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits
-    from list and adds a specific `__call__` method to apply each :class:`~transformers.FlaxLogitsProcessor` or
-    :class:`~transformers.FlaxLogitsWarper` to the inputs.
+    This class can be used to create a list of [`FlaxLogitsProcessor`] or
+    [`FlaxLogitsWarper`] to subsequently process a `scores` input tensor. This class inherits
+    from list and adds a specific `__call__` method to apply each [`FlaxLogitsProcessor`] or
+    [`FlaxLogitsWarper`] to the inputs.
     """
 
     @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
@@ -97,10 +97,10 @@ class FlaxLogitsProcessorList(list):
 
 class FlaxTemperatureLogitsWarper(FlaxLogitsWarper):
     r"""
-    :class:`transformers.LogitsWarper` for temperature (exponential scaling output probability distribution).
+    [`FlaxLogitsWarper`] for temperature (exponential scaling output probability distribution).
 
     Args:
-        temperature (:obj:`float`):
+        temperature (`float`):
             The value used to module the logits distribution.
     """
 
@@ -117,16 +117,16 @@ class FlaxTemperatureLogitsWarper(FlaxLogitsWarper):
 
 class FlaxTopPLogitsWarper(FlaxLogitsWarper):
     """
-    :class:`transformers.LogitsWarper` that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <=
+    [`FlaxLogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <=
     prob_cut_off.
 
     Args:
-        top_p (:obj:`float`):
-            If set to < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or higher are
+        top_p (`float`):
+            If set to < 1, only the most probable tokens with probabilities that add up to `top_p` or higher are
             kept for generation.
-        filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`):
+        filter_value (`float`, *optional*, defaults to `-float("Inf")`):
             All filtered values will be set to this float value.
-        min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1):
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
             Minimum number of tokens that cannot be filtered.
     """
 
@@ -159,14 +159,14 @@ class FlaxTopPLogitsWarper(FlaxLogitsWarper):
 
 class FlaxTopKLogitsWarper(FlaxLogitsWarper):
     r"""
-    :class:`transformers.LogitsWarper` that performs top-k, i.e. restricting to the k highest probability elements.
+    [`FlaxLogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.
 
     Args:
-        top_k (:obj:`int`):
+        top_k (`int`):
             The number of highest probability vocabulary tokens to keep for top-k-filtering.
-        filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`):
+        filter_value (`float`, *optional*, defaults to `-float("Inf")`):
             All filtered values will be set to this float value.
-        min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1):
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
             Minimum number of tokens that cannot be filtered.
     """
 
@@ -195,10 +195,10 @@ class FlaxTopKLogitsWarper(FlaxLogitsWarper):
 
 class FlaxForcedBOSTokenLogitsProcessor(FlaxLogitsProcessor):
     r"""
-    :class:`~transformers.FlaxLogitsProcessor` that enforces the specified token as the first generated token.
+    [`FlaxLogitsProcessor`] that enforces the specified token as the first generated token.
 
     Args:
-        bos_token_id (:obj:`int`):
+        bos_token_id (`int`):
             The id of the token to force as the first generated token.
     """
 
@@ -219,14 +219,14 @@ class FlaxForcedBOSTokenLogitsProcessor(FlaxLogitsProcessor):
 
 class FlaxForcedEOSTokenLogitsProcessor(FlaxLogitsProcessor):
     r"""
-    :class:`~transformers.FlaxLogitsProcessor` that enforces the specified token as the last generated token when
-    :obj:`max_length` is reached.
+    [`FlaxLogitsProcessor`] that enforces the specified token as the last generated token when
+    `max_length` is reached.
 
     Args:
-        max_length (:obj:`int`):
+        max_length (`int`):
             The maximum length of the sequence to be generated.
-        eos_token_id (:obj:`int`):
-            The id of the token to force as the last generated token when :obj:`max_length` is reached.
+        eos_token_id (`int`):
+            The id of the token to force as the last generated token when `max_length` is reached.
     """
 
     def __init__(self, max_length: int, eos_token_id: int):
@@ -247,13 +247,13 @@ class FlaxForcedEOSTokenLogitsProcessor(FlaxLogitsProcessor):
 
 class FlaxMinLengthLogitsProcessor(FlaxLogitsProcessor):
     r"""
-    :class:`transformers.FlaxLogitsProcessor` enforcing a min-length by setting EOS probability to 0.
+    [`FlaxLogitsProcessor`] enforcing a min-length by setting EOS probability to 0.
 
     Args:
-        min_length (:obj:`int`):
-            The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`.
-        eos_token_id (:obj:`int`):
-            The id of the `end-of-sequence` token.
+        min_length (`int`):
+            The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
+        eos_token_id (`int`):
+            The id of the *end-of-sequence* token.
     """
 
     def __init__(self, min_length: int, eos_token_id: int):
diff --git a/src/transformers/generation_flax_utils.py b/src/transformers/generation_flax_utils.py
index fa5a4225e5..634540a215 100644
--- a/src/transformers/generation_flax_utils.py
+++ b/src/transformers/generation_flax_utils.py
@@ -48,7 +48,7 @@ class FlaxGreedySearchOutput(ModelOutput):
 
 
     Args:
-        sequences (:obj:`jnp.ndarray` of shape :obj:`(batch_size, max_length)`):
+        sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
             The generated sequences.
     """
 
@@ -62,7 +62,7 @@ class FlaxSampleOutput(ModelOutput):
 
 
     Args:
-        sequences (:obj:`jnp.ndarray` of shape :obj:`(batch_size, max_length)`):
+        sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
             The generated sequences.
     """
 
@@ -76,9 +76,9 @@ class FlaxBeamSearchOutput(ModelOutput):
 
 
     Args:
-        sequences (:obj:`jnp.ndarray` of shape :obj:`(batch_size, max_length)`):
+        sequences (`jnp.ndarray` of shape `(batch_size, max_length)`):
             The generated sequences.
-        scores (:obj:`jnp.ndarray` of shape :obj:`(batch_size,)`):
+        scores (`jnp.ndarray` of shape `(batch_size,)`):
             The scores (log probabilites) of the generated sequences.
     """
 
@@ -119,7 +119,7 @@ class BeamSearchState:
 class FlaxGenerationMixin:
     """
     A class containing all of the functions supporting generation, to be used as a mixin in
-    :class:`~transformers.FlaxPreTrainedModel`.
+    [`FlaxPreTrainedModel`].
     """
 
     @staticmethod
@@ -149,7 +149,7 @@ class FlaxGenerationMixin:
         """
         This function can be overwritten in the specific modeling_flax_<model-name>.py classes to allow for custom beam
         search behavior. Note that the only model that overwrites this method is
-        :class:`~transformes.FlaxMarianMTModel`.
+        [`FlaxMarianMTModel`].
         """
         return logits
 
@@ -181,61 +181,62 @@ class FlaxGenerationMixin:
         Generates sequences for models with a language modeling head. The method currently supports greedy decoding,
         and, multinomial sampling.
 
-        Apart from :obj:`input_ids`, all the arguments below will default to the value of the attribute of the same
-        name inside the :class:`~transformers.PretrainedConfig` of the model. The default values indicated are the
+        Apart from `input_ids`, all the arguments below will default to the value of the attribute of the same
+        name inside the [`PretrainedConfig`] of the model. The default values indicated are the
         default values of those config.
 
-        Most of these parameters are explained in more detail in `this blog post
-        <https://huggingface.co/blog/how-to-generate>`__.
+        Most of these parameters are explained in more detail in [this blog post](https://huggingface.co/blog/how-to-generate).
 
         Parameters:
 
-            input_ids (:obj:`jnp.ndarray` of shape :obj:`(batch_size, sequence_length)`):
+            input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
                 The sequence used as a prompt for the generation.
-            max_length (:obj:`int`, `optional`, defaults to 20):
+            max_length (`int`, *optional*, defaults to 20):
                 The maximum length of the sequence to be generated.
-            do_sample (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            do_sample (`bool`, *optional*, defaults to `False`):
                 Whether or not to use sampling ; use greedy decoding otherwise.
-            temperature (:obj:`float`, `optional`, defaults to 1.0):
+            temperature (`float`, *optional*, defaults to 1.0):
                 The value used to module the next token probabilities.
-            top_k (:obj:`int`, `optional`, defaults to 50):
+            top_k (`int`, *optional*, defaults to 50):
                 The number of highest probability vocabulary tokens to keep for top-k-filtering.
-            top_p (:obj:`float`, `optional`, defaults to 1.0):
-                If set to float < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or
+            top_p (`float`, *optional*, defaults to 1.0):
+                If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or
                 higher are kept for generation.
-            pad_token_id (:obj:`int`, `optional`):
-                The id of the `padding` token.
-            bos_token_id (:obj:`int`, `optional`):
-                The id of the `beginning-of-sequence` token.
-            eos_token_id (:obj:`int`, `optional`):
-                The id of the `end-of-sequence` token.
-            num_beams (:obj:`int`, `optional`, defaults to 1):
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            bos_token_id (`int`, *optional*):
+                The id of the *beginning-of-sequence* token.
+            eos_token_id (`int`, *optional*):
+                The id of the *end-of-sequence* token.
+            num_beams (`int`, *optional*, defaults to 1):
                 Number of beams for beam search. 1 means no beam search.
-            decoder_start_token_id (:obj:`int`, `optional`):
-                If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token.
-            trace (:obj:`bool`, `optional`, defaults to :obj:`True`):
-                Whether to trace generation. Setting ``trace=False`` should only be used for debugging and will lead to
+            decoder_start_token_id (`int`, *optional*):
+                If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
+            trace (`bool`, *optional*, defaults to `True`):
+                Whether to trace generation. Setting `trace=False` should only be used for debugging and will lead to
                 a considerably slower runtime.
-            params (:obj:`Dict[str, jnp.ndarray]`, `optional`):
+            params (`Dict[str, jnp.ndarray]`, *optional*):
                 Optionally the model parameters can be passed. Can be useful for parallelized generation.
             model_kwargs:
-                Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model.
+                Additional model specific kwargs will be forwarded to the `forward` function of the model.
 
         Return:
-            :class:`~transformers.file_utils.ModelOutput`.
+            [`~file_utils.ModelOutput`].
 
-        Examples::
-            >>> from transformers import AutoTokenizer, FlaxAutoModelForCausalLM
+        Examples:
 
-            >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
-            >>> model = FlaxAutoModelForCausalLM.from_pretrained("distilgpt2")
-            >>> input_context = "The dog"
-            >>> # encode input context
-            >>> input_ids = tokenizer(input_context, return_tensors="np").input_ids
-            >>> # generate candidates using sampling
-            >>> outputs = model.generate(input_ids=input_ids, max_length=20, top_k=30, do_sample=True)
-            >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
-        """
+        ```python
+        >>> from transformers import AutoTokenizer, FlaxAutoModelForCausalLM
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+        >>> model = FlaxAutoModelForCausalLM.from_pretrained("distilgpt2")
+        >>> input_context = "The dog"
+        >>> # encode input context
+        >>> input_ids = tokenizer(input_context, return_tensors="np").input_ids
+        >>> # generate candidates using sampling
+        >>> outputs = model.generate(input_ids=input_ids, max_length=20, top_k=30, do_sample=True)
+        >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+        ```"""
         # set init values
         max_length = max_length if max_length is not None else self.config.max_length
         bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
@@ -326,8 +327,8 @@ class FlaxGenerationMixin:
         self, top_k: int = None, top_p: float = None, temperature: float = None
     ) -> FlaxLogitsProcessorList:
         """
-        This class returns a :class:`~transformers.FlaxLogitsProcessorList` list object that contains all relevant
-        :class:`~transformers.FlaxLogitsWarper` instances used for multinomial sampling.
+        This method returns a [`FlaxLogitsProcessorList`] list object that contains all relevant
+        [`FlaxLogitsWarper`] instances used for multinomial sampling.
         """
 
         # init warp parameters
@@ -358,8 +359,8 @@ class FlaxGenerationMixin:
         forced_eos_token_id: int,
     ) -> FlaxLogitsProcessorList:
         """
-        This class returns a :class:`~transformers.FlaxLogitsProcessorList` list object that contains all relevant
-        :class:`~transformers.FlaxLogitsProcessor` instances used to modify the scores of the language model head.
+        This method returns a [`FlaxLogitsProcessorList`] list object that contains all relevant
+        [`FlaxLogitsProcessor`] instances used to modify the scores of the language model head.
         """
         processors = FlaxLogitsProcessorList()
 
diff --git a/src/transformers/generation_logits_process.py b/src/transformers/generation_logits_process.py
index 4ce7c99444..8a9285f757 100644
--- a/src/transformers/generation_logits_process.py
+++ b/src/transformers/generation_logits_process.py
@@ -30,22 +30,22 @@ logger = get_logger(__name__)
 
 LOGITS_PROCESSOR_INPUTS_DOCSTRING = r"""
     Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using :class:`~transformers.BertTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            Indices can be obtained using [`BertTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
             details.
 
-            `What are input IDs? <../glossary.html#input-ids>`__
-        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`):
+            [What are input IDs?](../glossary#input-ids)
+        scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
             Prediction scores of a language modeling head. These can be logits for each vocabulary when not using beam
             search or log softmax for each vocabulary token when using beam search
         kwargs:
             Additional logits processor specific kwargs.
 
     Return:
-        :obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`: The processed prediction scores.
+        `torch.FloatTensor` of shape `(batch_size, config.vocab_size)`: The processed prediction scores.
 
 """
 
@@ -74,10 +74,10 @@ class LogitsWarper(ABC):
 
 class LogitsProcessorList(list):
     """
-    This class can be used to create a list of :class:`~transformers.LogitsProcessor` or
-    :class:`~transformers.LogitsWarper` to subsequently process a :obj:`scores` input tensor. This class inherits from
-    list and adds a specific `__call__` method to apply each :class:`~transformers.LogitsProcessor` or
-    :class:`~transformers.LogitsWarper` to the inputs.
+    This class can be used to create a list of [`LogitsProcessor`] or
+    [`LogitsWarper`] to subsequently process a `scores` input tensor. This class inherits from
+    list and adds a specific `__call__` method to apply each [`LogitsProcessor`] or
+    [`LogitsWarper`] to the inputs.
     """
 
     @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
@@ -98,13 +98,13 @@ class LogitsProcessorList(list):
 
 class MinLengthLogitsProcessor(LogitsProcessor):
     r"""
-    :class:`transformers.LogitsProcessor` enforcing a min-length by setting EOS probability to 0.
+    [`LogitsProcessor`] enforcing a min-length by setting EOS probability to 0.
 
     Args:
-        min_length (:obj:`int`):
-            The minimum length below which the score of :obj:`eos_token_id` is set to :obj:`-float("Inf")`.
-        eos_token_id (:obj:`int`):
-            The id of the `end-of-sequence` token.
+        min_length (`int`):
+            The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
+        eos_token_id (`int`):
+            The id of the *end-of-sequence* token.
     """
 
     def __init__(self, min_length: int, eos_token_id: int):
@@ -126,10 +126,10 @@ class MinLengthLogitsProcessor(LogitsProcessor):
 
 class TemperatureLogitsWarper(LogitsWarper):
     r"""
-    :class:`transformers.LogitsWarper` for temperature (exponential scaling output probability distribution).
+    [`LogitsWarper`] for temperature (exponential scaling output probability distribution).
 
     Args:
-        temperature (:obj:`float`):
+        temperature (`float`):
             The value used to module the logits distribution.
     """
 
@@ -146,12 +146,11 @@ class TemperatureLogitsWarper(LogitsWarper):
 
 class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
     r"""
-    :class:`transformers.LogitsProcessor` enforcing an exponential penalty on repeated sequences.
+    [`LogitsProcessor`] enforcing an exponential penalty on repeated sequences.
 
     Args:
-        repetition_penalty (:obj:`float`):
-            The parameter for repetition penalty. 1.0 means no penalty. See `this paper
-            <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
+        repetition_penalty (`float`):
+            The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
     """
 
     def __init__(self, penalty: float):
@@ -172,16 +171,16 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
 
 class TopPLogitsWarper(LogitsWarper):
     """
-    :class:`transformers.LogitsWarper` that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <=
+    [`LogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to <=
     prob_cut_off.
 
     Args:
-        top_p (:obj:`float`):
-            If set to < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or higher are
+        top_p (`float`):
+            If set to < 1, only the most probable tokens with probabilities that add up to `top_p` or higher are
             kept for generation.
-        filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`):
+        filter_value (`float`, *optional*, defaults to `-float("Inf")`):
             All filtered values will be set to this float value.
-        min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1):
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
             Minimum number of tokens that cannot be filtered.
     """
 
@@ -215,14 +214,14 @@ class TopPLogitsWarper(LogitsWarper):
 
 class TopKLogitsWarper(LogitsWarper):
     r"""
-    :class:`transformers.LogitsWarper` that performs top-k, i.e. restricting to the k highest probability elements.
+    [`LogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.
 
     Args:
-        top_k (:obj:`int`):
+        top_k (`int`):
             The number of highest probability vocabulary tokens to keep for top-k-filtering.
-        filter_value (:obj:`float`, `optional`, defaults to :obj:`-float("Inf")`):
+        filter_value (`float`, *optional*, defaults to `-float("Inf")`):
             All filtered values will be set to this float value.
-        min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1):
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
             Minimum number of tokens that cannot be filtered.
     """
 
@@ -279,12 +278,11 @@ def _calc_banned_ngram_tokens(
 
 class NoRepeatNGramLogitsProcessor(LogitsProcessor):
     r"""
-    :class:`transformers.LogitsProcessor` that enforces no repetition of n-grams. See `Fairseq
-    <https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345>`__.
+    [`LogitsProcessor`] that enforces no repetition of n-grams. See [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
 
     Args:
-        ngram_size (:obj:`int`):
-            All ngrams of size :obj:`ngram_size` can only occur once.
+        ngram_size (`int`):
+            All ngrams of size `ngram_size` can only occur once.
     """
 
     def __init__(self, ngram_size: int):
@@ -305,13 +303,13 @@ class NoRepeatNGramLogitsProcessor(LogitsProcessor):
 
 class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor):
     r"""
-    :class:`transformers.LogitsProcessor` that enforces no repetition of encoder input ids n-grams for the decoder ids.
-    See `ParlAI <https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/torch_generator_agent.py#L1350>`__.
+    [`LogitsProcessor`] that enforces no repetition of encoder input ids n-grams for the decoder ids.
+    See [ParlAI](https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/torch_generator_agent.py#L1350).
 
     Args:
-        encoder_ngram_size (:obj:`int`):
-            All ngrams of size :obj:`ngram_size` can only occur within the encoder input ids.
-        encoder_input_ids (:obj:`int`):
+        encoder_ngram_size (`int`):
+            All ngrams of size `encoder_ngram_size` can only occur within the encoder input ids.
+        encoder_input_ids (`int`):
             The encoder_input_ids that should not be repeated within the decoder ids.
     """
 
@@ -346,15 +344,14 @@ class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor):
 
 class NoBadWordsLogitsProcessor(LogitsProcessor):
     """
-    :class:`transformers.LogitsProcessor` that enforces that specified sequences will never be sampled.
+    [`LogitsProcessor`] that enforces that specified sequences will never be sampled.
 
     Args:
-        bad_words_ids (:obj:`List[List[int]]`):
+        bad_words_ids (`List[List[int]]`):
             List of list of token ids that are not allowed to be generated. In order to get the tokens of the words
-            that should not appear in the generated text, use :obj:`tokenizer(bad_word,
-            add_prefix_space=True).input_ids`.
-        eos_token_id (:obj:`int`):
-            The id of the `end-of-sequence` token.
+            that should not appear in the generated text, use `tokenizer(bad_word, add_prefix_space=True).input_ids`.
+        eos_token_id (`int`):
+            The id of the *end-of-sequence* token.
     """
 
     def __init__(self, bad_words_ids: List[List[int]], eos_token_id: int):
@@ -474,16 +471,16 @@ class NoBadWordsLogitsProcessor(LogitsProcessor):
 
 class PrefixConstrainedLogitsProcessor(LogitsProcessor):
     r"""
-    :class:`transformers.LogitsProcessor` that enforces constrained generation and is useful for prefix-conditioned
-    constrained generation. See `Autoregressive Entity Retrieval <https://arxiv.org/abs/2010.00904>`__ for more
+    [`LogitsProcessor`] that enforces constrained generation and is useful for prefix-conditioned
+    constrained generation. See [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904) for more
     information.
 
     Args:
-        prefix_allowed_tokens_fn: (:obj:`Callable[[int, torch.Tensor], List[int]]`):
+        prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`):
             This function constraints the beam search to allowed tokens only at each step. This function takes 2
-            arguments :obj:`inputs_ids` and the batch ID :obj:`batch_id`. It has to return a list with the allowed
-            tokens for the next generation step conditioned on the previously generated tokens :obj:`inputs_ids` and
-            the batch ID :obj:`batch_id`.
+            arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed
+            tokens for the next generation step conditioned on the previously generated tokens `inputs_ids` and
+            the batch ID `batch_id`.
     """
 
     def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], num_beams: int):
@@ -501,20 +498,20 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
 
 class HammingDiversityLogitsProcessor(LogitsProcessor):
     r"""
-    :class:`transformers.LogitsProcessor` that enforces diverse beam search. Note that this logits processor is only
-    effective for :meth:`transformers.PreTrainedModel.group_beam_search`. See `Diverse Beam Search: Decoding Diverse
-    Solutions from Neural Sequence Models <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details.
+    [`LogitsProcessor`] that enforces diverse beam search. Note that this logits processor is only
+    effective for [`PreTrainedModel.group_beam_search`]. See [Diverse Beam Search: Decoding Diverse
+    Solutions from Neural Sequence Models](https://arxiv.org/pdf/1610.02424.pdf) for more details.
 
     Args:
-        diversity_penalty (:obj:`float`):
+        diversity_penalty (`float`):
             This value is subtracted from a beam's score if it generates a token same as any beam from other group at a
-            particular time. Note that :obj:`diversity_penalty` is only effective if ``group beam search`` is enabled.
-        num_beams (:obj:`int`):
-            Number of beams used for group beam search. See `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for
+            particular time. Note that `diversity_penalty` is only effective if `group beam search` is enabled.
+        num_beams (`int`):
+            Number of beams used for group beam search. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for
             more details.
-        num_beam_groups (:obj:`int`):
-            Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of
-            beams. See `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details.
+        num_beam_groups (`int`):
+            Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
+            beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
     """
 
     def __init__(self, diversity_penalty: float, num_beams: int, num_beam_groups: int):
@@ -561,10 +558,10 @@ class HammingDiversityLogitsProcessor(LogitsProcessor):
 
 class ForcedBOSTokenLogitsProcessor(LogitsProcessor):
     r"""
-    :class:`~transformers.LogitsProcessor` that enforces the specified token as the first generated token.
+    [`LogitsProcessor`] that enforces the specified token as the first generated token.
 
     Args:
-        bos_token_id (:obj:`int`):
+        bos_token_id (`int`):
             The id of the token to force as the first generated token.
     """
 
@@ -582,14 +579,14 @@ class ForcedBOSTokenLogitsProcessor(LogitsProcessor):
 
 class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
     r"""
-    :class:`~transformers.LogitsProcessor` that enforces the specified token as the last generated token when
-    :obj:`max_length` is reached.
+    [`LogitsProcessor`] that enforces the specified token as the last generated token when
+    `max_length` is reached.
 
     Args:
-        max_length (:obj:`int`):
+        max_length (`int`):
             The maximum length of the sequence to be generated.
-        eos_token_id (:obj:`int`):
-            The id of the token to force as the last generated token when :obj:`max_length` is reached.
+        eos_token_id (`int`):
+            The id of the token to force as the last generated token when `max_length` is reached.
     """
 
     def __init__(self, max_length: int, eos_token_id: int):
@@ -607,9 +604,9 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
 
 class InfNanRemoveLogitsProcessor(LogitsProcessor):
     r"""
-    :class:`~transformers.LogitsProcessor` that removes all :obj:`nan` and :obj:`inf` values to avoid the generation
+    [`LogitsProcessor`] that removes all `nan` and `inf` values to avoid the generation
     method to fail. Note that using the logits processor should only be used if necessary since it can slow down the
-    generation method. :obj:`max_length` is reached.
+    generation method.
     """
 
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
diff --git a/src/transformers/generation_stopping_criteria.py b/src/transformers/generation_stopping_criteria.py
index 479a524606..27b92114bd 100644
--- a/src/transformers/generation_stopping_criteria.py
+++ b/src/transformers/generation_stopping_criteria.py
@@ -11,22 +11,22 @@ from .file_utils import add_start_docstrings
 
 STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
     Args:
-        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using :class:`~transformers.BertTokenizer`. See
-            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            Indices can be obtained using [`BertTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
             details.
 
-            `What are input IDs? <../glossary.html#input-ids>`__
-        scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.vocab_size)`):
+            [What are input IDs?](../glossary#input-ids)
+        scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
             Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
             or scores for each vocabulary token after SoftMax.
         kwargs:
             Additional stopping criteria specific kwargs.
 
     Return:
-        :obj:`bool`. :obj:`False` indicates we should continue, :obj:`True` indicates we should stop.
+        `bool`. `False` indicates we should continue, `True` indicates we should stop.
 
 """
 
@@ -41,11 +41,11 @@ class StoppingCriteria(ABC):
 
 class MaxLengthCriteria(StoppingCriteria):
     """
-    This class can be used to stop generation whenever the full generated number of tokens exceeds :obj:`max_length`.
+    This class can be used to stop generation whenever the full generated number of tokens exceeds `max_length`.
     Keep in mind for decoder-only type of transformers, this will include the initial prompted tokens.
 
     Args:
-        max_length (:obj:`int`):
+        max_length (`int`):
             The maximum length that the output sequence can have in number of tokens.
     """
 
@@ -59,14 +59,14 @@ class MaxLengthCriteria(StoppingCriteria):
 
 class MaxNewTokensCriteria(StoppingCriteria):
     """
-    This class can be used to stop generation whenever the generated number of tokens exceeds :obj:`max_new_tokens`.
+    This class can be used to stop generation whenever the generated number of tokens exceeds `max_new_tokens`.
     Keep in mind for decoder-only type of transformers, this will **not** include the initial prompted tokens. This is
-    very close to :obj:`MaxLengthCriteria` but ignores the number of initial tokens.
+    very close to [`MaxLengthCriteria`] but ignores the number of initial tokens.
 
     Args:
-        start_length (:obj:`int`):
+        start_length (`int`):
             The number of initial tokens.
-        max_new_tokens (:obj:`int`):
+        max_new_tokens (`int`):
             The maximum number of tokens to generate.
     """
 
@@ -90,12 +90,12 @@ class MaxTimeCriteria(StoppingCriteria):
     """
     This class can be used to stop generation whenever the full generation exceeds some amount of time. By default, the
     time will start being counted when you initialize this function. You can override this by passing an
-    :obj:`initial_time`.
+    `initial_time`.
 
     Args:
-        max_time (:obj:`float`):
+        max_time (`float`):
             The maximum allowed time in seconds for the generation.
-        initial_time (:obj:`float`, `optional`, defaults to :obj:`time.time()`):
+        initial_time (`float`, *optional*, defaults to `time.time()`):
             The start of the generation allowed time.
     """
 
diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py
index d91ff8ce6f..5cc103c7f7 100644
--- a/src/transformers/generation_tf_utils.py
+++ b/src/transformers/generation_tf_utils.py
@@ -34,19 +34,19 @@ class TFGreedySearchDecoderOnlyOutput(ModelOutput):
 
 
     Args:
-        sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
-            at each generation step. :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`tf.Tensor` with
-            each tensor of shape :obj:`(batch_size, config.vocab_size)`).
-        attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with
+            each tensor of shape `(batch_size, config.vocab_size)`.
+        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
-        hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size, generated_length, hidden_size)`.
+            `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`.
     """
 
     sequences: tf.Tensor = None
@@ -64,28 +64,27 @@ class TFGreedySearchEncoderDecoderOutput(ModelOutput):
 
 
     Args:
-        sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
-            at each generation step. :obj:`(max_length-1,)`-shaped tuple of :obj:`tf.Tensor` with each tensor of shape
-            :obj:`(batch_size, config.vocab_size)`).
-        encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` (one for each layer of the decoder) of shape :obj:`(batch_size, num_heads,
-            sequence_length, sequence_length)`.
-        encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape :obj:`(batch_size, sequence_length, hidden_size)`.
-        decoder_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            at each generation step. `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape
+            `(batch_size, config.vocab_size)`.
+        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
-        cross_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
-        decoder_hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size, generated_length, hidden_size)`.
+            `tf.Tensor` of shape `(batch_size, generated_length, hidden_size)`.
     """
 
     sequences: tf.Tensor = None
@@ -104,20 +103,19 @@ class TFSampleDecoderOnlyOutput(ModelOutput):
 
 
     Args:
-        sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
-            at each generation step. :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`tf.Tensor` with
-            each tensor of shape :obj:`(batch_size*num_return_sequences, config.vocab_size)`).
-        attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with
+            each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`.
+        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(num_return_sequences*batch_size, num_heads, generated_length,
-            sequence_length)`.
-        hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `tf.Tensor` of shape `(num_return_sequences*batch_size, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(num_return_sequences*batch_size, generated_length, hidden_size)`.
+            `tf.Tensor` of shape `(num_return_sequences*batch_size, generated_length, hidden_size)`.
     """
 
     sequences: tf.Tensor = None
@@ -135,29 +133,28 @@ class TFSampleEncoderDecoderOutput(ModelOutput):
 
 
     Args:
-        sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
-            at each generation step. :obj:`(max_length-1,)`-shaped tuple of :obj:`tf.Tensor` with each tensor of shape
-            :obj:`(batch_size*num_return_sequences, config.vocab_size)`).
-        encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` (one for each layer of the decoder) of shape
-            :obj:`(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`.
-        encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape :obj:`(batch_size*num_return_sequences, sequence_length, hidden_size)`.
-        decoder_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            at each generation step. `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape
+            `(batch_size*num_return_sequences, config.vocab_size)`.
+        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer of the decoder) of shape
+            `(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`.
+        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size*num_return_sequences, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences, num_heads, generated_length,
-            sequence_length)`.
-        cross_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            `tf.Tensor` of shape `(batch_size*num_return_sequences, num_heads, generated_length, sequence_length)`.
+        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
-        decoder_hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences, generated_length, hidden_size)`.
+            `tf.Tensor` of shape `(batch_size*num_return_sequences, generated_length, hidden_size)`.
     """
 
     sequences: tf.Tensor = None
@@ -175,23 +172,22 @@ class TFBeamSearchDecoderOnlyOutput(ModelOutput):
     Base class for outputs of decoder-only generation models using beam search.
 
     Args:
-        sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        sequences_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
-            Final beam scores of the generated ``sequences``.
-        scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Final beam scores of the generated `sequences`.
+        scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
             softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
-            . :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`tf.Tensor` with each tensor of shape
-            :obj:`(batch_size*num_beams*num_return_sequences, config.vocab_size)`).
-        attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            . `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor of shape
+            `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
+        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
-        hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, generated_length,
-            hidden_size)`.
+            `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
     """
 
     sequences: tf.Tensor = None
@@ -209,34 +205,31 @@ class TFBeamSearchEncoderDecoderOutput(ModelOutput):
     attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
 
     Args:
-        sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        sequences_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
-            Final beam scores of the generated ``sequences``.
-        scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        sequences_scores (`tf.Tensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Final beam scores of the generated `sequences`.
+        scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
             softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
-            . :obj:`(max_length-1,)`-shaped tuple of :obj:`tf.Tensor` with each tensor of shape
-            :obj:`(batch_size*num_beams, config.vocab_size)`).
-        attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
-        encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` (one for each layer of the decoder) of shape :obj:`(batch_size, num_heads,
-            sequence_length, sequence_length)`.
-        encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape :obj:`(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
-        decoder_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            . `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape
+            `(batch_size*num_beams, config.vocab_size)`.
+        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, num_heads, generated_length,
-            sequence_length)`.
-        cross_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length, sequence_length)`.
+        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
-        decoder_hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, generated_length,
-            hidden_size)`.
+            `tf.Tensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
     """
 
     sequences: tf.Tensor = None
@@ -255,22 +248,22 @@ class TFBeamSampleDecoderOnlyOutput(ModelOutput):
     Base class for outputs of decoder-only generation models using beam sample.
 
     Args:
-        sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        sequences_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size * num_return_sequence)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
-            Final beam scores of the generated ``sequences``.
-        scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`tf.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        sequences_scores (`tf.Tensor` of shape `(batch_size * num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Final beam scores of the generated `sequences`.
+        scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
             softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
-            . :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`tf.Tensor` with each tensor of shape
-            :obj:`(batch_size*num_beams*num_return_sequences, config.vocab_size)`).
-        attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            . `(max_length-input_ids.shape[-1],)`-shaped tuple of `tf.Tensor` with each tensor of shape
+            `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
+        attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
-        hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams, generated_length, hidden_size)`.
+            `tf.Tensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`.
     """
 
     sequences: tf.Tensor = None
@@ -288,31 +281,30 @@ class TFBeamSampleEncoderDecoderOutput(ModelOutput):
     encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
 
     Args:
-        sequences (:obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        sequences_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size * num_return_sequence)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
-            Final beam scores of the generated ``sequences``.
-        scores (:obj:`tuple(tf.Tensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`tf.Tensor` of shape `(batch_size*num_beams, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        sequences_scores (`tf.Tensor` of shape `(batch_size * num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Final beam scores of the generated `sequences`.
+        scores (`tuple(tf.Tensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
             softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
-            . :obj:`(max_length-1,)`-shaped tuple of :obj:`tf.Tensor` with each tensor of shape
-            :obj:`(batch_size*num_beams, config.vocab_size)`).
-        encoder_attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
-            Tuple of :obj:`tf.Tensor` (one for each layer of the decoder) of shape :obj:`(batch_size, num_heads,
-            sequence_length, sequence_length)`.
-        encoder_hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
-            shape :obj:`(batch_size*num_beams, sequence_length, hidden_size)`.
-        decoder_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            . `(max_length-1,)`-shaped tuple of `tf.Tensor` with each tensor of shape
+            `(batch_size*num_beams, config.vocab_size)`.
+        encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+        encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size*num_beams, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
-        cross_attentions (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            `tf.Tensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+        cross_attentions (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
-        decoder_hidden_states (:obj:`tuple(tuple(tf.Tensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `tf.Tensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(tf.Tensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`tf.Tensor` of shape :obj:`(batch_size*num_beams, generated_length, hidden_size)`.
+            `tf.Tensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`.
     """
 
     sequences: tf.Tensor = None
@@ -334,12 +326,12 @@ TFBeamSampleOutput = Union[TFBeamSampleEncoderDecoderOutput, TFBeamSampleDecoder
 class TFGenerationMixin:
     """
     A class containing all of the functions supporting generation, to be used as a mixin in
-    :class:`~transformers.TFPreTrainedModel`.
+    [`TFPreTrainedModel`].
     """
 
     def prepare_inputs_for_generation(self, inputs, **kwargs):
         """
-        Implement in subclasses of :class:`~transformers.TFPreTrainedModel` for custom behavior to prepare inputs in
+        Implement in subclasses of [`TFPreTrainedModel`] for custom behavior to prepare inputs in
         the generate method.
         """
         return {"input_ids": inputs}
@@ -387,148 +379,146 @@ class TFGenerationMixin:
         Generates sequences for models with a language modeling head. The method currently supports greedy decoding,
         beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling.
 
-        Adapted in part from `Facebook's XLM beam search code
-        <https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529>`__.
+        Adapted in part from [Facebook's XLM beam search code](https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529).
 
-        Apart from :obj:`input_ids` and :obj:`attention_mask`, all the arguments below will default to the value of the
-        attribute of the same name inside the :class:`~transformers.PretrainedConfig` of the model. The default values
+        Apart from `input_ids` and `attention_mask`, all the arguments below will default to the value of the
+        attribute of the same name inside the [`PretrainedConfig`] of the model. The default values
         indicated are the default values of those config.
 
-        Most of these parameters are explained in more detail in `this blog post
-        <https://huggingface.co/blog/how-to-generate>`__.
+        Most of these parameters are explained in more detail in [this blog post](https://huggingface.co/blog/how-to-generate).
 
         Parameters:
 
-            input_ids (:obj:`tf.Tensor` of :obj:`dtype=tf.int32` and shape :obj:`(batch_size, sequence_length)`, `optional`):
-                The sequence used as a prompt for the generation. If :obj:`None` the method initializes it with
-                :obj:`bos_token_id` and a batch size of 1.
-            max_length (:obj:`int`, `optional`, defaults to 20):
+            input_ids (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*):
+                The sequence used as a prompt for the generation. If `None` the method initializes it with
+                `bos_token_id` and a batch size of 1.
+            max_length (`int`, *optional*, defaults to 20):
                 The maximum length of the sequence to be generated.
-            min_length (:obj:`int`, `optional`, defaults to 10):
+            min_length (`int`, *optional*, defaults to 10):
                 The minimum length of the sequence to be generated.
-            do_sample (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            do_sample (`bool`, *optional*, defaults to `False`):
                 Whether or not to use sampling ; use greedy decoding otherwise.
-            early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not.
-            num_beams (:obj:`int`, `optional`, defaults to 1):
+            early_stopping (`bool`, *optional*, defaults to `False`):
+                Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
+            num_beams (`int`, *optional*, defaults to 1):
                 Number of beams for beam search. 1 means no beam search.
-            temperature (:obj:`float`, `optional`, defaults to 1.0):
+            temperature (`float`, *optional*, defaults to 1.0):
                 The value used to module the next token probabilities.
-            top_k (:obj:`int`, `optional`, defaults to 50):
+            top_k (`int`, *optional*, defaults to 50):
                 The number of highest probability vocabulary tokens to keep for top-k-filtering.
-            top_p (:obj:`float`, `optional`, defaults to 1.0):
-                If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or
+            top_p (`float`, *optional*, defaults to 1.0):
+                If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or
                 higher are kept for generation.
-            repetition_penalty (:obj:`float`, `optional`, defaults to 1.0):
-                The parameter for repetition penalty. 1.0 means no penalty. See `this paper
-                <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
-            pad_token_id (:obj:`int`, `optional`):
-                The id of the `padding` token.
-            bos_token_id (:obj:`int`, `optional`):
-                The id of the `beginning-of-sequence` token.
-            eos_token_id (:obj:`int`, `optional`):
-                The id of the `end-of-sequence` token.
-            length_penalty (:obj:`float`, `optional`, defaults to 1.0):
+            repetition_penalty (`float`, *optional*, defaults to 1.0):
+                The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            bos_token_id (`int`, *optional*):
+                The id of the *beginning-of-sequence* token.
+            eos_token_id (`int`, *optional*):
+                The id of the *end-of-sequence* token.
+            length_penalty (`float`, *optional*, defaults to 1.0):
                 Exponential penalty to the length. 1.0 means no penalty.
 
                 Set to values < 1.0 in order to encourage the model to generate shorter sequences, to a value > 1.0 in
                 order to encourage the model to produce longer sequences.
-            no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0):
+            no_repeat_ngram_size (`int`, *optional*, defaults to 0):
                 If set to int > 0, all ngrams of that size can only occur once.
-            bad_words_ids(:obj:`List[int]`, `optional`):
+            bad_words_ids (`List[int]`, *optional*):
                 List of token ids that are not allowed to be generated. In order to get the tokens of the words that
-                should not appear in the generated text, use :obj:`tokenizer.encode(bad_word, add_prefix_space=True)`.
-            num_return_sequences(:obj:`int`, `optional`, defaults to 1):
+                should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`.
+            num_return_sequences (`int`, *optional*, defaults to 1):
                 The number of independently computed returned sequences for each element in the batch.
-            attention_mask (:obj:`tf.Tensor` of :obj:`dtype=tf.int32` and shape :obj:`(batch_size, sequence_length)`, `optional`):
-                Mask to avoid performing attention on padding token indices. Mask values are in ``[0, 1]``, 1 for
+            attention_mask (`tf.Tensor` of `dtype=tf.int32` and shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, 1 for
                 tokens that are not masked, and 0 for masked tokens.
 
-                If not provided, will default to a tensor the same shape as :obj:`input_ids` that masks the pad token.
+                If not provided, will default to a tensor the same shape as `input_ids` that masks the pad token.
 
-                `What are attention masks? <../glossary.html#attention-mask>`__
-            decoder_start_token_id (:obj:`int`, `optional`):
-                If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token.
-            use_cache: (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                [What are attention masks?](../glossary#attention-mask)
+            decoder_start_token_id (`int`, *optional*):
+                If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
+            use_cache (`bool`, *optional*, defaults to `True`):
                 Whether or not the model should use the past last key/values attentions (if applicable to the model) to
                 speed up decoding.
-            output_attentions (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
-            return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
-            forced_bos_token_id (:obj:`int`, `optional`):
-                The id of the token to force as the first generated token after the :obj:`decoder_start_token_id`.
-                Useful for multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token
+            output_scores (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+            forced_bos_token_id (`int`, *optional*):
+                The id of the token to force as the first generated token after the `decoder_start_token_id`.
+                Useful for multilingual models like [mBART](../model_doc/mbart) where the first generated token
                 needs to be the target language token.
-            forced_eos_token_id (:obj:`int`, `optional`):
-                The id of the token to force as the last generated token when :obj:`max_length` is reached.
+            forced_eos_token_id (`int`, *optional*):
+                The id of the token to force as the last generated token when `max_length` is reached.
             model_specific_kwargs:
-                Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model.
+                Additional model specific kwargs will be forwarded to the `forward` function of the model.
 
         Return:
-            :class:`~transformers.file_utils.ModelOutput` or :obj:`tf.Tensor`: A
-            :class:`~transformers.file_utils.ModelOutput` (if ``return_dict_in_generate=True`` or when
-            ``config.return_dict_in_generate=True``) or a :obj:`tf.Tensor`.
+            [`~file_utils.ModelOutput`] or `tf.Tensor`: A
+            [`~file_utils.ModelOutput`] (if `return_dict_in_generate=True` or when
+            `config.return_dict_in_generate=True`) or a `tf.Tensor`.
 
-                If the model is `not` an encoder-decoder model (``model.config.is_encoder_decoder=False``), the
-                possible :class:`~transformers.file_utils.ModelOutput` types are:
+                If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the
+                possible [`~file_utils.ModelOutput`] types are:
 
-                    - :class:`~transformers.generation_utils.TFGreedySearchDecoderOnlyOutput`,
-                    - :class:`~transformers.generation_utils.TFSampleDecoderOnlyOutput`,
-                    - :class:`~transformers.generation_utils.TFBeamSearchDecoderOnlyOutput`,
-                    - :class:`~transformers.generation_utils.TFBeamSampleDecoderOnlyOutput`
+                    - [`~generation_tf_utils.TFGreedySearchDecoderOnlyOutput`],
+                    - [`~generation_tf_utils.TFSampleDecoderOnlyOutput`],
+                    - [`~generation_tf_utils.TFBeamSearchDecoderOnlyOutput`],
+                    - [`~generation_tf_utils.TFBeamSampleDecoderOnlyOutput`]
 
-                If the model is an encoder-decoder model (``model.config.is_encoder_decoder=True``), the possible
-                :class:`~transformers.file_utils.ModelOutput` types are:
+                If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
+                [`~file_utils.ModelOutput`] types are:
 
-                    - :class:`~transformers.generation_utils.TFGreedySearchEncoderDecoderOutput`,
-                    - :class:`~transformers.generation_utils.TFSampleEncoderDecoderOutput`,
-                    - :class:`~transformers.generation_utils.TFBeamSearchEncoderDecoderOutput`,
-                    - :class:`~transformers.generation_utils.TFBeamSampleEncoderDecoderOutput`
+                    - [`~generation_tf_utils.TFGreedySearchEncoderDecoderOutput`],
+                    - [`~generation_tf_utils.TFSampleEncoderDecoderOutput`],
+                    - [`~generation_tf_utils.TFBeamSearchEncoderDecoderOutput`],
+                    - [`~generation_tf_utils.TFBeamSampleEncoderDecoderOutput`]
 
-        Examples::
+        Examples:
 
-            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
-            model = TFAutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from huggingface.co and cache.
-            outputs = model.generate(max_length=40)  # do greedy decoding
-            print(f'Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')
+        ```python
+        tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
+        model = TFAutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from huggingface.co and cache.
+        outputs = model.generate(max_length=40)  # do greedy decoding
+        print(f'Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')
 
-            tokenizer = AutoTokenizer.from_pretrained('openai-gpt')   # Initialize tokenizer
-            model = TFAutoModelWithLMHead.from_pretrained('openai-gpt')    # Download model and configuration from huggingface.co and cache.
-            input_context = 'The dog'
-            input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
-            outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5)  # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
-            for i in range(3): #  3 output sequences were generated
-                print(f'Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}')
+        tokenizer = AutoTokenizer.from_pretrained('openai-gpt')   # Initialize tokenizer
+        model = TFAutoModelWithLMHead.from_pretrained('openai-gpt')    # Download model and configuration from huggingface.co and cache.
+        input_context = 'The dog'
+        input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
+        outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5)  # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
+        for i in range(3): #  3 output sequences were generated
+            print(f'Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}')
 
-            tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
-            model = TFAutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from huggingface.co and cache.
-            input_context = 'The dog'
-            input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
-            outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True)  # generate 3 candidates using sampling
-            for i in range(3): #  3 output sequences were generated
-                print(f'Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}')
+        tokenizer = AutoTokenizer.from_pretrained('distilgpt2')   # Initialize tokenizer
+        model = TFAutoModelWithLMHead.from_pretrained('distilgpt2')    # Download model and configuration from huggingface.co and cache.
+        input_context = 'The dog'
+        input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
+        outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, num_return_sequences=3, do_sample=True)  # generate 3 candidates using sampling
+        for i in range(3): #  3 output sequences were generated
+            print(f'Generated {i}: {tokenizer.decode(outputs[i], skip_special_tokens=True)}')
 
-            tokenizer = AutoTokenizer.from_pretrained('ctrl')   # Initialize tokenizer
-            model = TFAutoModelWithLMHead.from_pretrained('ctrl')    # Download model and configuration from huggingface.co and cache.
-            input_context = 'Legal My neighbor is'  # "Legal" is one of the control codes for ctrl
-            input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
-            outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2)  # generate sequences
-            print(f'Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')
+        tokenizer = AutoTokenizer.from_pretrained('ctrl')   # Initialize tokenizer
+        model = TFAutoModelWithLMHead.from_pretrained('ctrl')    # Download model and configuration from huggingface.co and cache.
+        input_context = 'Legal My neighbor is'  # "Legal" is one of the control codes for ctrl
+        input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
+        outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2)  # generate sequences
+        print(f'Generated: {tokenizer.decode(outputs[0], skip_special_tokens=True)}')
 
-            tokenizer = AutoTokenizer.from_pretrained('gpt2')   # Initialize tokenizer
-            model = TFAutoModelWithLMHead.from_pretrained('gpt2')    # Download model and configuration from huggingface.co and cache.
-            input_context = 'My cute dog'
-            bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
-            input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
-            outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids)  # generate sequences without allowing bad_words to be generated
-        """
+        tokenizer = AutoTokenizer.from_pretrained('gpt2')   # Initialize tokenizer
+        model = TFAutoModelWithLMHead.from_pretrained('gpt2')    # Download model and configuration from huggingface.co and cache.
+        input_context = 'My cute dog'
+        bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
+        input_ids = tokenizer.encode(input_context, return_tensors='tf')  # encode input context
+        outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids)  # generate sequences without allowing bad_words to be generated
+        ```"""
 
         # We cannot generate if the model does not have a LM head
         if self.get_output_embeddings() is None:
@@ -1448,7 +1438,7 @@ class TFGenerationMixin:
         self, logits, cur_len, max_length, forced_bos_token_id, forced_eos_token_id, **kwargs
     ):
         """
-        Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in
+        Implement in subclasses of [`PreTrainedModel`] for custom behavior to adjust the logits in
         the generate method.
         """
         vocab_size = getattr(self.config, "vocab_size", None)
@@ -1546,12 +1536,12 @@ def tf_top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float("In
 
     Args:
         logits: logits distribution shape (batch size, vocabulary size)
-        top_k (:obj:`int`, `optional`, defaults to 0):
+        top_k (`int`, *optional*, defaults to 0):
             If > 0, only keep the top k tokens with highest probability (top-k filtering)
-        top_p (:obj:`float`, `optional`, defaults to 1.0):
+        top_p (`float`, *optional*, defaults to 1.0):
             If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus
             filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
-        min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1):
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
             Minimumber of tokens we keep per batch example in the output.
 
     From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 24ac094bfc..f8abfa53b8 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -60,19 +60,19 @@ class GreedySearchDecoderOnlyOutput(ModelOutput):
 
 
     Args:
-        sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
-            at each generation step. :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor`
-            with each tensor of shape :obj:`(batch_size, config.vocab_size)`).
-        attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor`
+            with each tensor of shape `(batch_size, config.vocab_size)`.
+        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
-        hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size, generated_length, hidden_size)`.
+            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
     """
 
     sequences: torch.LongTensor = None
@@ -90,28 +90,27 @@ class GreedySearchEncoderDecoderOutput(ModelOutput):
 
 
     Args:
-        sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
-            at each generation step. :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor
-            of shape :obj:`(batch_size, config.vocab_size)`).
-        encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape :obj:`(batch_size,
-            num_heads, sequence_length, sequence_length)`.
-        encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
-        decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            at each generation step. `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor
+            of shape `(batch_size, config.vocab_size)`.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape `(batch_size, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
-        cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
-        decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size, generated_length, hidden_size)`.
+            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
     """
 
     sequences: torch.LongTensor = None
@@ -130,20 +129,19 @@ class SampleDecoderOnlyOutput(ModelOutput):
 
 
     Args:
-        sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
-            at each generation step. :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor`
-            with each tensor of shape :obj:`(batch_size*num_return_sequences, config.vocab_size)`).
-        attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            at each generation step. `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor`
+            with each tensor of shape `(batch_size*num_return_sequences, config.vocab_size)`.
+        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(num_return_sequences*batch_size, num_heads, generated_length,
-            sequence_length)`.
-        hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `torch.FloatTensor` of shape `(num_return_sequences*batch_size, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(num_return_sequences*batch_size, generated_length, hidden_size)`.
+            `torch.FloatTensor` of shape `(num_return_sequences*batch_size, generated_length, hidden_size)`.
     """
 
     sequences: torch.LongTensor = None
@@ -161,29 +159,28 @@ class SampleEncoderDecoderOutput(ModelOutput):
 
 
     Args:
-        sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
-            at each generation step. :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor
-            of shape :obj:`(batch_size*num_return_sequences, config.vocab_size)`).
-        encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape
-            :obj:`(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`.
-        encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size*num_return_sequences, sequence_length, hidden_size)`.
-        decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            at each generation step. `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor
+            of shape `(batch_size*num_return_sequences, config.vocab_size)`.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape
+            `(batch_size*num_return_sequences, num_heads, sequence_length, sequence_length)`.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape `(batch_size*num_return_sequences, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences, num_heads, generated_length,
-            sequence_length)`.
-        cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            `torch.FloatTensor` of shape `(batch_size*num_return_sequences, num_heads, generated_length, sequence_length)`.
+        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
-        decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences, generated_length, hidden_size)`.
+            `torch.FloatTensor` of shape `(batch_size*num_return_sequences, generated_length, hidden_size)`.
     """
 
     sequences: torch.LongTensor = None
@@ -201,24 +198,22 @@ class BeamSearchDecoderOnlyOutput(ModelOutput):
     Base class for outputs of decoder-only generation models using beam search.
 
     Args:
-        sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
-            Final beam scores of the generated ``sequences``.
-        scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Final beam scores of the generated `sequences`.
+        scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
             softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
-            . :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of
-            shape :obj:`(batch_size*num_beams*num_return_sequences, config.vocab_size)`).
-        attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            . `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor` with each tensor of
+            shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
+        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length,
-            sequence_length)`.
-        hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, generated_length,
-            hidden_size)`.
+            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
     """
 
     sequences: torch.LongTensor = None
@@ -236,34 +231,31 @@ class BeamSearchEncoderDecoderOutput(ModelOutput):
     attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
 
     Args:
-        sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_return_sequences)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
-            Final beam scores of the generated ``sequences``.
-        scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Final beam scores of the generated `sequences`.
+        scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
             softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
-            . :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of shape
-            :obj:`(batch_size*num_beams, config.vocab_size)`).
-        attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
-        encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape :obj:`(batch_size,
-            num_heads, sequence_length, sequence_length)`.
-        encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
-        decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            . `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor of shape
+            `(batch_size*num_beams, config.vocab_size)`.
+        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, num_heads,
-            generated_length, sequence_length)`.
-        cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length, sequence_length)`.
+        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
-        decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams*num_return_sequences, generated_length,
-            hidden_size)`.
+            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
     """
 
     sequences: torch.LongTensor = None
@@ -282,23 +274,22 @@ class BeamSampleDecoderOnlyOutput(ModelOutput):
     Base class for outputs of decoder-only generation models using beam sample.
 
     Args:
-        sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_return_sequences, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_return_sequence)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
-            Final beam scores of the generated ``sequences``.
-        scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        sequences_scores (`torch.FloatTensor` of shape `(batch_size * num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Final beam scores of the generated `sequences`.
+        scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
             softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
-            . :obj:`(max_length-input_ids.shape[-1],)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of
-            shape :obj:`(batch_size*num_beams*num_return_sequences, config.vocab_size)`).
-        attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            . `(max_length-input_ids.shape[-1],)`-shaped tuple of `torch.FloatTensor` with each tensor of
+            shape `(batch_size*num_beams*num_return_sequences, config.vocab_size)`.
+        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length,
-            sequence_length)`.
-        hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, generated_length, hidden_size)`.
+            `torch.FloatTensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`.
     """
 
     sequences: torch.LongTensor = None
@@ -316,32 +307,30 @@ class BeamSampleEncoderDecoderOutput(ModelOutput):
     encoder_hidden_states attributes (respectively the decoder_attentions and the decoder_hidden_states attributes)
 
     Args:
-        sequences (:obj:`torch.LongTensor` of shape :obj:`(batch_size*num_beams, sequence_length)`):
-            The generated sequences. The second dimension (sequence_length) is either equal to :obj:`max_length` or
-            shorter if all batches finished early due to the :obj:`eos_token_id`.
-        sequences_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size * num_return_sequence)`, `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
-            Final beam scores of the generated ``sequences``.
-        scores (:obj:`tuple(torch.FloatTensor)` `optional`, returned when ``output_scores=True`` is passed or when ``config.output_scores=True``):
+        sequences (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
+            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
+            shorter if all batches finished early due to the `eos_token_id`.
+        sequences_scores (`torch.FloatTensor` of shape `(batch_size * num_return_sequences)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
+            Final beam scores of the generated `sequences`.
+        scores (`tuple(torch.FloatTensor)`, *optional*, returned when `output_scores=True` is passed or when `config.output_scores=True`):
             Processed beam scores for each vocabulary token at each generation step. Beam scores consisting of log
             softmax scores for each vocabulary token and sum of log softmax of previously generated tokens in this beam
-            . :obj:`(max_length-1,)`-shaped tuple of :obj:`torch.FloatTensor` with each tensor of shape
-            :obj:`(batch_size*num_beams, config.vocab_size)`).
-        encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for each layer of the decoder) of shape :obj:`(batch_size,
-            num_heads, sequence_length, sequence_length)`.
-        encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
-            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
-            of shape :obj:`(batch_size*num_beams, sequence_length, hidden_size)`.
-        decoder_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            . `(max_length-1,)`-shaped tuple of `torch.FloatTensor` with each tensor of shape
+            `(batch_size*num_beams, config.vocab_size)`.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads, sequence_length, sequence_length)`.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape `(batch_size*num_beams, sequence_length, hidden_size)`.
+        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, num_heads, generated_length,
-            sequence_length)`.
-        cross_attentions (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_attentions=True`` is passed or ``config.output_attentions=True``):
+            `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
+        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or `config.output_attentions=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_heads, generated_length, sequence_length)`.
-        decoder_hidden_states (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
+        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
             Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
-            :obj:`torch.FloatTensor` of shape :obj:`(batch_size*num_beams, generated_length, hidden_size)`.
+            `torch.FloatTensor` of shape `(batch_size*num_beams, generated_length, hidden_size)`.
     """
 
     sequences: torch.LongTensor = None
@@ -366,7 +355,7 @@ ENCODER_MODEL_INPUT_NAMES = ["input_ids", "inputs_embeds", "input_values", "inpu
 class GenerationMixin:
     """
     A class containing all of the functions supporting generation, to be used as a mixin in
-    :class:`~transformers.PreTrainedModel`.
+    [`PreTrainedModel`].
     """
 
     def _prepare_model_inputs(
@@ -428,14 +417,14 @@ class GenerationMixin:
 
     def prepare_inputs_for_generation(self, input_ids: torch.LongTensor, **kwargs) -> Dict[str, Any]:
         """
-        Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to prepare inputs in the
+        Implement in subclasses of [`PreTrainedModel`] for custom behavior to prepare inputs in the
         generate method.
         """
         return {"input_ids": input_ids}
 
     def adjust_logits_during_generation(self, logits: torch.FloatTensor, **kwargs) -> torch.FloatTensor:
         """
-        Implement in subclasses of :class:`~transformers.PreTrainedModel` for custom behavior to adjust the logits in
+        Implement in subclasses of [`PreTrainedModel`] for custom behavior to adjust the logits in
         the generate method.
         """
         return logits
@@ -612,8 +601,8 @@ class GenerationMixin:
         self, top_k: int = None, top_p: float = None, temperature: float = None, num_beams: int = None
     ) -> LogitsProcessorList:
         """
-        This class returns a :class:`~transformers.LogitsProcessorList` list object that contains all relevant
-        :class:`~transformers.LogitsWarper` instances used for multinomial sampling.
+        This method returns a [`LogitsProcessorList`] list object that contains all relevant
+        [`LogitsWarper`] instances used for multinomial sampling.
         """
 
         # init warp parameters
@@ -653,8 +642,8 @@ class GenerationMixin:
         logits_processor: Optional[LogitsProcessorList],
     ) -> LogitsProcessorList:
         """
-        This class returns a :class:`~transformers.LogitsProcessorList` list object that contains all relevant
-        :class:`~transformers.LogitsProcessor` instances used to modify the scores of the language model head.
+        This method returns a [`LogitsProcessorList`] list object that contains all relevant
+        [`LogitsProcessor`] instances used to modify the scores of the language model head.
         """
         processors = LogitsProcessorList()
 
@@ -793,198 +782,196 @@ class GenerationMixin:
         Generates sequences for models with a language modeling head. The method currently supports greedy decoding,
         multinomial sampling, beam-search decoding, and beam-search multinomial sampling.
 
-        Apart from :obj:`inputs`, all the arguments below will default to the value of the attribute of the same name
-        inside the :class:`~transformers.PretrainedConfig` of the model. The default values indicated are the default
+        Apart from `inputs`, all the arguments below will default to the value of the attribute of the same name
+        inside the [`PretrainedConfig`] of the model. The default values indicated are the default
         values of those config.
 
-        Most of these parameters are explained in more detail in `this blog post
-        <https://huggingface.co/blog/how-to-generate>`__.
+        Most of these parameters are explained in more detail in [this blog post](https://huggingface.co/blog/how-to-generate).
 
         Parameters:
 
-            inputs (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, :obj:`(batch_size, sequence_length, feature_dim)` or :obj:`(batch_size, num_channels, height, width)`, `optional`):
-                The sequence used as a prompt for the generation or as model inputs to the encoder. If :obj:`None` the
-                method initializes it with :obj:`bos_token_id` and a batch size of 1. For decoder-only models
-                :obj:`inputs` should of in the format of :obj:`input_ids`. For encoder-decoder models `inputs` can
-                represent any of :obj:`input_ids`, :obj:`input_values`, :obj:`input_features`, or :obj:`pixel_values`.
-            max_length (:obj:`int`, `optional`, defaults to :obj:`model.config.max_length`):
+            inputs (`torch.Tensor` of shape `(batch_size, sequence_length)`, `(batch_size, sequence_length, feature_dim)` or `(batch_size, num_channels, height, width)`, *optional*):
+                The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
+                method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models
+                `inputs` should be in the format of `input_ids`. For encoder-decoder models `inputs` can
+                represent any of `input_ids`, `input_values`, `input_features`, or `pixel_values`.
+            max_length (`int`, *optional*, defaults to `model.config.max_length`):
                 The maximum length of the sequence to be generated.
-            max_new_tokens (:obj:`int`, `optional`, defaults to None):
+            max_new_tokens (`int`, *optional*, defaults to None):
                 The maximum numbers of tokens to generate, ignore the current number of tokens. Use either
-                :obj:`max_new_tokens` or :obj:`max_length` but not both, they serve the same purpose.
-            min_length (:obj:`int`, `optional`, defaults to 10):
+                `max_new_tokens` or `max_length` but not both, they serve the same purpose.
+            min_length (`int`, *optional*, defaults to 10):
                 The minimum length of the sequence to be generated.
-            do_sample (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            do_sample (`bool`, *optional*, defaults to `False`):
                 Whether or not to use sampling ; use greedy decoding otherwise.
-            early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether to stop the beam search when at least ``num_beams`` sentences are finished per batch or not.
-            num_beams (:obj:`int`, `optional`, defaults to 1):
+            early_stopping (`bool`, *optional*, defaults to `False`):
+                Whether to stop the beam search when at least `num_beams` sentences are finished per batch or not.
+            num_beams (`int`, *optional*, defaults to 1):
                 Number of beams for beam search. 1 means no beam search.
-            temperature (:obj:`float`, `optional`, defaults to 1.0):
+            temperature (`float`, *optional*, defaults to 1.0):
                 The value used to module the next token probabilities.
-            top_k (:obj:`int`, `optional`, defaults to 50):
+            top_k (`int`, *optional*, defaults to 50):
                 The number of highest probability vocabulary tokens to keep for top-k-filtering.
-            top_p (:obj:`float`, `optional`, defaults to 1.0):
-                If set to float < 1, only the most probable tokens with probabilities that add up to :obj:`top_p` or
+            top_p (`float`, *optional*, defaults to 1.0):
+                If set to float < 1, only the most probable tokens with probabilities that add up to `top_p` or
                 higher are kept for generation.
-            repetition_penalty (:obj:`float`, `optional`, defaults to 1.0):
-                The parameter for repetition penalty. 1.0 means no penalty. See `this paper
-                <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
-            pad_token_id (:obj:`int`, `optional`):
-                The id of the `padding` token.
-            bos_token_id (:obj:`int`, `optional`):
-                The id of the `beginning-of-sequence` token.
-            eos_token_id (:obj:`int`, `optional`):
-                The id of the `end-of-sequence` token.
-            length_penalty (:obj:`float`, `optional`, defaults to 1.0):
+            repetition_penalty (`float`, *optional*, defaults to 1.0):
+                The parameter for repetition penalty. 1.0 means no penalty. See [this paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            bos_token_id (`int`, *optional*):
+                The id of the *beginning-of-sequence* token.
+            eos_token_id (`int`, *optional*):
+                The id of the *end-of-sequence* token.
+            length_penalty (`float`, *optional*, defaults to 1.0):
                 Exponential penalty to the length. 1.0 means no penalty. Set to values < 1.0 in order to encourage the
                 model to generate shorter sequences, to a value > 1.0 in order to encourage the model to produce longer
                 sequences.
-            no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0):
+            no_repeat_ngram_size (`int`, *optional*, defaults to 0):
                 If set to int > 0, all ngrams of that size can only occur once.
-            encoder_no_repeat_ngram_size (:obj:`int`, `optional`, defaults to 0):
-                If set to int > 0, all ngrams of that size that occur in the ``encoder_input_ids`` cannot occur in the
-                ``decoder_input_ids``.
-            bad_words_ids(:obj:`List[List[int]]`, `optional`):
+            encoder_no_repeat_ngram_size (`int`, *optional*, defaults to 0):
+                If set to int > 0, all ngrams of that size that occur in the `encoder_input_ids` cannot occur in the
+                `decoder_input_ids`.
+            bad_words_ids (`List[List[int]]`, *optional*):
                 List of token ids that are not allowed to be generated. In order to get the tokens of the words that
-                should not appear in the generated text, use :obj:`tokenizer(bad_word,
-                add_prefix_space=True).input_ids`.
-            num_return_sequences(:obj:`int`, `optional`, defaults to 1):
+                should not appear in the generated text, use `tokenizer(bad_word, add_prefix_space=True).input_ids`.
+            num_return_sequences (`int`, *optional*, defaults to 1):
                 The number of independently computed returned sequences for each element in the batch.
-            max_time(:obj:`float`, `optional`, defaults to None):
+            max_time (`float`, *optional*, defaults to None):
                 The maximum amount of time you allow the computation to run for in seconds. generation will still
                 finish the current pass after allocated time has been passed.
-            attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-                Mask to avoid performing attention on padding token indices. Mask values are in ``[0, 1]``, 1 for
+            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values are in `[0, 1]`, 1 for
                 tokens that are not masked, and 0 for masked tokens. If not provided, will default to a tensor the same
-                shape as :obj:`input_ids` that masks the pad token. `What are attention masks?
-                <../glossary.html#attention-mask>`__
-            decoder_start_token_id (:obj:`int`, `optional`):
-                If an encoder-decoder model starts decoding with a different token than `bos`, the id of that token.
-            use_cache: (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                shape as `input_ids` that masks the pad token. [What are attention masks?](../glossary#attention-mask)
+            decoder_start_token_id (`int`, *optional*):
+                If an encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
+            use_cache (`bool`, *optional*, defaults to `True`):
                 Whether or not the model should use the past last key/values attentions (if applicable to the model) to
                 speed up decoding.
-            num_beam_groups (:obj:`int`, `optional`, defaults to 1):
-                Number of groups to divide :obj:`num_beams` into in order to ensure diversity among different groups of
-                beams. `this paper <https://arxiv.org/pdf/1610.02424.pdf>`__ for more details.
-            diversity_penalty (:obj:`float`, `optional`, defaults to 0.0):
+            num_beam_groups (`int`, *optional*, defaults to 1):
+                Number of groups to divide `num_beams` into in order to ensure diversity among different groups of
+                beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
+            diversity_penalty (`float`, *optional*, defaults to 0.0):
                 This value is subtracted from a beam's score if it generates a token same as any beam from other group
-                at a particular time. Note that :obj:`diversity_penalty` is only effective if ``group beam search`` is
+                at a particular time. Note that `diversity_penalty` is only effective if `group beam search` is
                 enabled.
-            prefix_allowed_tokens_fn: (:obj:`Callable[[int, torch.Tensor], List[int]]`, `optional`):
+            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
                 If provided, this function constraints the beam search to allowed tokens only at each step. If not
-                provided no constraint is applied. This function takes 2 arguments: the batch ID :obj:`batch_id` and
-                :obj:`input_ids`. It has to return a list with the allowed tokens for the next generation step
-                conditioned on the batch ID :obj:`batch_id` and the previously generated tokens :obj:`inputs_ids`. This
+                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
+                `input_ids`. It has to return a list with the allowed tokens for the next generation step
+                conditioned on the batch ID `batch_id` and the previously generated tokens `input_ids`. This
                 argument is useful for constrained generation conditioned on the prefix, as described in
-                `Autoregressive Entity Retrieval <https://arxiv.org/abs/2010.00904>`__.
-            logits_processor (:obj:`LogitsProcessorList`, `optional`):
+                [Autoregressive Entity Retrieval](https://arxiv.org/abs/2010.00904).
+            logits_processor (`LogitsProcessorList`, *optional*):
                  Custom logits processors that complement the default logits processors built from arguments and a
                  model's config. If a logit processor is passed that is already created with the arguments or a model's
                  config an error is thrown. This feature is intended for advanced users.
-            stopping_criteria (:obj:`StoppingCriteriaList`, `optional`):
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
                  Custom stopping criteria that complement the default stopping criteria built from arguments and a
                  model's config. If a stopping criteria is passed that is already created with the arguments or a
                  model's config an error is thrown. This feature is intended for advanced users.
-            output_attentions (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
-            return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
-            forced_bos_token_id (:obj:`int`, `optional`):
-                The id of the token to force as the first generated token after the :obj:`decoder_start_token_id`.
-                Useful for multilingual models like :doc:`mBART <../model_doc/mbart>` where the first generated token
+            output_scores (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+            forced_bos_token_id (`int`, *optional*):
+                The id of the token to force as the first generated token after the `decoder_start_token_id`.
+                Useful for multilingual models like [mBART](../model_doc/mbart) where the first generated token
                 needs to be the target language token.
-            forced_eos_token_id (:obj:`int`, `optional`):
-                The id of the token to force as the last generated token when :obj:`max_length` is reached.
-            remove_invalid_values (:obj:`bool`, `optional`):
-                Whether to remove possible `nan` and `inf` outputs of the model to prevent the generation method to
-                crash. Note that using ``remove_invalid_values`` can slow down generation.
-            synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            forced_eos_token_id (`int`, *optional*):
+                The id of the token to force as the last generated token when `max_length` is reached.
+            remove_invalid_values (`bool`, *optional*):
+                Whether to remove possible `nan` and `inf` outputs of the model to prevent the generation method from
+                crashing. Note that using `remove_invalid_values` can slow down generation.
+            synced_gpus (`bool`, *optional*, defaults to `False`):
                 Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
 
             model_kwargs:
-                Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If the
+                Additional model specific kwargs will be forwarded to the `forward` function of the model. If the
                 model is an encoder-decoder model, encoder specific kwargs should not be prefixed and decoder specific
-                kwargs should be prefixed with `decoder_`.
+                kwargs should be prefixed with `decoder_`.
 
         Return:
-            :class:`~transformers.file_utils.ModelOutput` or :obj:`torch.LongTensor`: A
-            :class:`~transformers.file_utils.ModelOutput` (if ``return_dict_in_generate=True`` or when
-            ``config.return_dict_in_generate=True``) or a :obj:`torch.FloatTensor`.
+            [`~file_utils.ModelOutput`] or `torch.LongTensor`: A
+            [`~file_utils.ModelOutput`] (if `return_dict_in_generate=True` or when
+            `config.return_dict_in_generate=True`) or a `torch.LongTensor`.
 
-                If the model is `not` an encoder-decoder model (``model.config.is_encoder_decoder=False``), the
-                possible :class:`~transformers.file_utils.ModelOutput` types are:
+                If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the
+                possible [`~file_utils.ModelOutput`] types are:
 
-                    - :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput`,
-                    - :class:`~transformers.generation_utils.SampleDecoderOnlyOutput`,
-                    - :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput`,
-                    - :class:`~transformers.generation_utils.BeamSampleDecoderOnlyOutput`
+                    - [`~generation_utils.GreedySearchDecoderOnlyOutput`],
+                    - [`~generation_utils.SampleDecoderOnlyOutput`],
+                    - [`~generation_utils.BeamSearchDecoderOnlyOutput`],
+                    - [`~generation_utils.BeamSampleDecoderOnlyOutput`]
 
-                If the model is an encoder-decoder model (``model.config.is_encoder_decoder=True``), the possible
-                :class:`~transformers.file_utils.ModelOutput` types are:
+                If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
+                [`~file_utils.ModelOutput`] types are:
 
-                    - :class:`~transformers.generation_utils.GreedySearchEncoderDecoderOutput`,
-                    - :class:`~transformers.generation_utils.SampleEncoderDecoderOutput`,
-                    - :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput`,
-                    - :class:`~transformers.generation_utils.BeamSampleEncoderDecoderOutput`
+                    - [`~generation_utils.GreedySearchEncoderDecoderOutput`],
+                    - [`~generation_utils.SampleEncoderDecoderOutput`],
+                    - [`~generation_utils.BeamSearchEncoderDecoderOutput`],
+                    - [`~generation_utils.BeamSampleEncoderDecoderOutput`]
 
-        Examples::
-            >>> from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
+        Examples:
 
-            >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
-            >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-            >>> # do greedy decoding without providing a prompt
-            >>> outputs = model.generate(max_length=40)
-            >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))
+        ```python
+        >>> from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
 
-            >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
-            >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
-            >>> document = (
-            ... "at least two people were killed in a suspected bomb attack on a passenger bus "
-            ... "in the strife-torn southern philippines on monday , the military said."
-            ... )
-            >>> # encode input context
-            >>> input_ids = tokenizer(document, return_tensors="pt").input_ids
-            >>> # generate 3 independent sequences using beam search decoding (5 beams)
-            >>> # with T5 encoder-decoder model conditioned on short news article.
-            >>> outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3)
-            >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+        >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+        >>> # do greedy decoding without providing a prompt
+        >>> outputs = model.generate(max_length=40)
+        >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))
 
-            >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
-            >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-            >>> input_context = "The dog"
-            >>> # encode input context
-            >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids
-            >>> # generate 3 candidates using sampling
-            >>> outputs = model.generate(input_ids=input_ids, max_length=20, num_return_sequences=3, do_sample=True)
-            >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+        >>> document = (
+        ... "at least two people were killed in a suspected bomb attack on a passenger bus "
+        ... "in the strife-torn southern philippines on monday , the military said."
+        ... )
+        >>> # encode input context
+        >>> input_ids = tokenizer(document, return_tensors="pt").input_ids
+        >>> # generate 3 independent sequences using beam search decoding (5 beams)
+        >>> # with T5 encoder-decoder model conditioned on short news article.
+        >>> outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3)
+        >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
 
-            >>> tokenizer = AutoTokenizer.from_pretrained("ctrl")
-            >>> model = AutoModelForCausalLM.from_pretrained("ctrl")
-            >>> # "Legal" is one of the control codes for ctrl
-            >>> input_context = "Legal My neighbor is"
-            >>> # encode input context
-            >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids
-            >>> outputs = model.generate(input_ids=input_ids, max_length=20, repetition_penalty=1.2)
-            >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))
+        >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+        >>> input_context = "The dog"
+        >>> # encode input context
+        >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids
+        >>> # generate 3 candidates using sampling
+        >>> outputs = model.generate(input_ids=input_ids, max_length=20, num_return_sequences=3, do_sample=True)
+        >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
 
-            >>> tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=False)
-            >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
-            >>> input_context = "My cute dog"
-            >>> # get tokens of words that should not be generated
-            >>> bad_words_ids = tokenizer(["idiot", "stupid", "shut up"], add_prefix_space=True).input_ids
-            >>> # encode input context
-            >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids
-            >>> # generate sequences without allowing bad_words to be generated
-            >>> outputs = model.generate(input_ids=input_ids, max_length=20, do_sample=True, bad_words_ids=bad_words_ids)
-            >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))
-        """
+        >>> tokenizer = AutoTokenizer.from_pretrained("ctrl")
+        >>> model = AutoModelForCausalLM.from_pretrained("ctrl")
+        >>> # "Legal" is one of the control codes for ctrl
+        >>> input_context = "Legal My neighbor is"
+        >>> # encode input context
+        >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids
+        >>> outputs = model.generate(input_ids=input_ids, max_length=20, repetition_penalty=1.2)
+        >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2", use_fast=False)
+        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+        >>> input_context = "My cute dog"
+        >>> # get tokens of words that should not be generated
+        >>> bad_words_ids = tokenizer(["idiot", "stupid", "shut up"], add_prefix_space=True).input_ids
+        >>> # encode input context
+        >>> input_ids = tokenizer(input_context, return_tensors="pt").input_ids
+        >>> # generate sequences without allowing bad_words to be generated
+        >>> outputs = model.generate(input_ids=input_ids, max_length=20, do_sample=True, bad_words_ids=bad_words_ids)
+        >>> print("Generated:", tokenizer.decode(outputs[0], skip_special_tokens=True))
+        ```"""
         # 1. Set generation parameters if not already defined
         bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
         num_beams = num_beams if num_beams is not None else self.config.num_beams
@@ -1292,75 +1279,76 @@ class GenerationMixin:
 
         Parameters:
 
-            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                 The sequence used as a prompt for the generation.
-            logits_processor (:obj:`LogitsProcessorList`, `optional`):
-                An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
-                :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling
+            logits_processor (`LogitsProcessorList`, *optional*):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from
+                [`LogitsProcessor`] used to modify the prediction scores of the language modeling
                 head applied at each generation step.
-            stopping_criteria (:obj:`StoppingCriteriaList`, `optional`):
-                An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from
-                :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop.
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                An instance of [`StoppingCriteriaList`]. List of instances of class derived from
+                [`StoppingCriteria`] used to tell if the generation loop should stop.
 
-            max_length (:obj:`int`, `optional`, defaults to 20):
-                **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of
+            max_length (`int`, *optional*, defaults to 20):
+                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of
                 generated tokens. The maximum length of the sequence to be generated.
-            pad_token_id (:obj:`int`, `optional`):
-                The id of the `padding` token.
-            eos_token_id (:obj:`int`, `optional`):
-                The id of the `end-of-sequence` token.
-            output_attentions (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            eos_token_id (`int`, *optional*):
+                The id of the *end-of-sequence* token.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
-            return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
-            synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            output_scores (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+            synced_gpus (`bool`, *optional*, defaults to `False`):
                 Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
             model_kwargs:
-                Additional model specific keyword arguments will be forwarded to the :obj:`forward` function of the
-                model. If model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
+                Additional model specific keyword arguments will be forwarded to the `forward` function of the
+                model. If model is an encoder-decoder model the kwargs should include `encoder_outputs`.
 
         Return:
-            :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput`,
-            :class:`~transformers.generation_utils.GreedySearchEncoderDecoderOutput` or obj:`torch.LongTensor`: A
-            :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a
-            :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput` if
-            ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a
-            :class:`~transformers.generation_utils.GreedySearchEncoderDecoderOutput` if
-            ``model.config.is_encoder_decoder=True``.
+            [`~generation_utils.GreedySearchDecoderOnlyOutput`],
+            [`~generation_utils.GreedySearchEncoderDecoderOutput`] or `torch.LongTensor`: A
+            `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation_utils.GreedySearchDecoderOnlyOutput`] if
+            `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a
+            [`~generation_utils.GreedySearchEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
 
-        Examples::
+        Examples:
 
-            >>> from transformers import (
-            ... AutoTokenizer,
-            ... AutoModelForCausalLM,
-            ... LogitsProcessorList,
-            ... MinLengthLogitsProcessor,
-            ... )
+        ```python
+        >>> from transformers import (
+        ... AutoTokenizer,
+        ... AutoModelForCausalLM,
+        ... LogitsProcessorList,
+        ... MinLengthLogitsProcessor,
+        ... )
 
-            >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-            >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
 
-            >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token
-            >>> model.config.pad_token_id = model.config.eos_token_id
+        >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
+        >>> model.config.pad_token_id = model.config.eos_token_id
 
-            >>> input_prompt = "Today is a beautiful day, and"
-            >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
+        >>> input_prompt = "Today is a beautiful day, and"
+        >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
 
-            >>> # instantiate logits processors
-            >>> logits_processor = LogitsProcessorList([
-            ...     MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
-            ... ])
+        >>> # instantiate logits processors
+        >>> logits_processor = LogitsProcessorList([
+        ...     MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
+        ... ])
 
-            >>> outputs = model.greedy_search(input_ids, logits_processor=logits_processor)
+        >>> outputs = model.greedy_search(input_ids, logits_processor=logits_processor)
 
-            >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
-        """
+        >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+        ```"""
         # init values
         logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
         stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
@@ -1518,85 +1506,86 @@ class GenerationMixin:
 
         Parameters:
 
-            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                 The sequence used as a prompt for the generation.
-            logits_processor (:obj:`LogitsProcessorList`, `optional`):
-                An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
-                :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling
+            logits_processor (`LogitsProcessorList`, *optional*):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from
+                [`LogitsProcessor`] used to modify the prediction scores of the language modeling
                 head applied at each generation step.
-            stopping_criteria (:obj:`StoppingCriteriaList`, `optional`):
-                An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from
-                :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop.
-            logits_warper (:obj:`LogitsProcessorList`, `optional`):
-                An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
-                :class:`~transformers.LogitsWarper` used to warp the prediction score distribution of the language
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                An instance of [`StoppingCriteriaList`]. List of instances of class derived from
+                [`StoppingCriteria`] used to tell if the generation loop should stop.
+            logits_warper (`LogitsProcessorList`, *optional*):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from
+                [`LogitsWarper`] used to warp the prediction score distribution of the language
                 modeling head applied before multinomial sampling at each generation step.
-            max_length (:obj:`int`, `optional`, defaults to 20):
-                **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of
+            max_length (`int`, *optional*, defaults to 20):
+                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of
                 generated tokens. The maximum length of the sequence to be generated.
-            pad_token_id (:obj:`int`, `optional`):
-                The id of the `padding` token.
-            eos_token_id (:obj:`int`, `optional`):
-                The id of the `end-of-sequence` token.
-            output_attentions (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            eos_token_id (`int`, *optional*):
+                The id of the *end-of-sequence* token.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
-            return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
-            synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            output_scores (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+            synced_gpus (`bool`, *optional*, defaults to `False`):
                 Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
             model_kwargs:
-                Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If
-                model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
+                Additional model specific kwargs will be forwarded to the `forward` function of the model. If
+                model is an encoder-decoder model the kwargs should include `encoder_outputs`.
 
         Return:
-            :class:`~transformers.generation_utils.SampleDecoderOnlyOutput`,
-            :class:`~transformers.generation_utils.SampleEncoderDecoderOutput` or obj:`torch.LongTensor`: A
-            :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a
-            :class:`~transformers.generation_utils.SampleDecoderOnlyOutput` if
-            ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a
-            :class:`~transformers.generation_utils.SampleEncoderDecoderOutput` if
-            ``model.config.is_encoder_decoder=True``.
+            [`~generation_utils.SampleDecoderOnlyOutput`],
+            [`~generation_utils.SampleEncoderDecoderOutput`] or `torch.LongTensor`: A
+            `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation_utils.SampleDecoderOnlyOutput`] if
+            `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a
+            [`~generation_utils.SampleEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
 
-        Examples::
+        Examples:
 
-            >>> from transformers import (
-            ...    AutoTokenizer,
-            ...    AutoModelForCausalLM,
-            ...    LogitsProcessorList,
-            ...    MinLengthLogitsProcessor,
-            ...    TopKLogitsWarper,
-            ...    TemperatureLogitsWarper,
-            ... )
+        ```python
+        >>> from transformers import (
+        ...    AutoTokenizer,
+        ...    AutoModelForCausalLM,
+        ...    LogitsProcessorList,
+        ...    MinLengthLogitsProcessor,
+        ...    TopKLogitsWarper,
+        ...    TemperatureLogitsWarper,
+        ... )
 
-            >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-            >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
+        >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        >>> model = AutoModelForCausalLM.from_pretrained("gpt2")
 
-            >>> # set pad_token_id to eos_token_id because GPT2 does not have a EOS token
-            >>> model.config.pad_token_id = model.config.eos_token_id
+        >>> # set pad_token_id to eos_token_id because GPT2 does not have a PAD token
+        >>> model.config.pad_token_id = model.config.eos_token_id
 
-            >>> input_prompt = "Today is a beautiful day, and"
-            >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
+        >>> input_prompt = "Today is a beautiful day, and"
+        >>> input_ids = tokenizer(input_prompt, return_tensors="pt").input_ids
 
-            >>> # instantiate logits processors
-            >>> logits_processor = LogitsProcessorList([
-            ...     MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
-            ... ])
-            >>> # instantiate logits processors
-            >>> logits_warper = LogitsProcessorList([
-            ...     TopKLogitsWarper(50),
-            ...     TemperatureLogitsWarper(0.7),
-            ... ])
+        >>> # instantiate logits processors
+        >>> logits_processor = LogitsProcessorList([
+        ...     MinLengthLogitsProcessor(15, eos_token_id=model.config.eos_token_id),
+        ... ])
+        >>> # instantiate logits warpers
+        >>> logits_warper = LogitsProcessorList([
+        ...     TopKLogitsWarper(50),
+        ...     TemperatureLogitsWarper(0.7),
+        ... ])
 
-            >>> outputs = model.sample(input_ids, logits_processor=logits_processor, logits_warper=logits_warper)
+        >>> outputs = model.sample(input_ids, logits_processor=logits_processor, logits_warper=logits_warper)
 
-            >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
-        """
+        >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+        ```"""
 
         # init values
         logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
@@ -1759,97 +1748,98 @@ class GenerationMixin:
 
         Parameters:
 
-            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                 The sequence used as a prompt for the generation.
-            beam_scorer (:obj:`BeamScorer`):
-                An derived instance of :class:`~transformers.BeamScorer` that defines how beam hypotheses are
+            beam_scorer (`BeamScorer`):
+                A derived instance of [`BeamScorer`] that defines how beam hypotheses are
                 constructed, stored and sorted during generation. For more information, the documentation of
-                :class:`~transformers.BeamScorer` should be read.
-            logits_processor (:obj:`LogitsProcessorList`, `optional`):
-                An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
-                :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling
+                [`BeamScorer`] should be read.
+            logits_processor (`LogitsProcessorList`, *optional*):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from
+                [`LogitsProcessor`] used to modify the prediction scores of the language modeling
                 head applied at each generation step.
-            stopping_criteria (:obj:`StoppingCriteriaList`, `optional`):
-                An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from
-                :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop.
-            max_length (:obj:`int`, `optional`, defaults to 20):
-                **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                An instance of [`StoppingCriteriaList`]. List of instances of class derived from
+                [`StoppingCriteria`] used to tell if the generation loop should stop.
+            max_length (`int`, *optional*, defaults to 20):
+                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of
                 generated tokens. The maximum length of the sequence to be generated.
-            pad_token_id (:obj:`int`, `optional`):
-                The id of the `padding` token.
-            eos_token_id (:obj:`int`, `optional`):
-                The id of the `end-of-sequence` token.
-            output_attentions (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            eos_token_id (`int`, *optional*):
+                The id of the *end-of-sequence* token.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
-            return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
-            synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            output_scores (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+            synced_gpus (`bool`, *optional*, defaults to `False`):
                 Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
             model_kwargs:
-                Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If
-                model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
+                Additional model specific kwargs will be forwarded to the `forward` function of the model. If
+                model is an encoder-decoder model the kwargs should include `encoder_outputs`.
 
         Return:
-            :class:`~transformers.generation_utilsBeamSearchDecoderOnlyOutput`,
-            :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` or obj:`torch.LongTensor`: A
-            :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a
-            :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if
-            ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a
-            :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` if
-            ``model.config.is_encoder_decoder=True``.
+            [`~generation_utils.BeamSearchDecoderOnlyOutput`],
+            [`~generation_utils.BeamSearchEncoderDecoderOutput`] or `torch.LongTensor`: A
+            `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation_utils.BeamSearchDecoderOnlyOutput`] if
+            `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a
+            [`~generation_utils.BeamSearchEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
 
 
-        Examples::
+        Examples:
 
-            >>> from transformers import (
-            ...    AutoTokenizer,
-            ...    AutoModelForSeq2SeqLM,
-            ...    LogitsProcessorList,
-            ...    MinLengthLogitsProcessor,
-            ...    BeamSearchScorer,
-            ... )
-            >>> import torch
+        ```python
+        >>> from transformers import (
+        ...    AutoTokenizer,
+        ...    AutoModelForSeq2SeqLM,
+        ...    LogitsProcessorList,
+        ...    MinLengthLogitsProcessor,
+        ...    BeamSearchScorer,
+        ... )
+        >>> import torch
 
-            >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
-            >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
 
-            >>> encoder_input_str = "translate English to German: How old are you?"
-            >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
+        >>> encoder_input_str = "translate English to German: How old are you?"
+        >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
 
 
-            >>> # lets run beam search using 3 beams
-            >>> num_beams = 3
-            >>> # define decoder start token ids
-            >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
-            >>> input_ids = input_ids * model.config.decoder_start_token_id
+        >>> # let's run beam search using 3 beams
+        >>> num_beams = 3
+        >>> # define decoder start token ids
+        >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
+        >>> input_ids = input_ids * model.config.decoder_start_token_id
 
-            >>> # add encoder_outputs to model keyword arguments
-            >>> model_kwargs = {
-            ...     "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
-            ... }
+        >>> # add encoder_outputs to model keyword arguments
+        >>> model_kwargs = {
+        ...     "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
+        ... }
 
-            >>> # instantiate beam scorer
-            >>> beam_scorer = BeamSearchScorer(
-            ...     batch_size=1,
-            ...     num_beams=num_beams,
-            ...     device=model.device,
-            ... )
+        >>> # instantiate beam scorer
+        >>> beam_scorer = BeamSearchScorer(
+        ...     batch_size=1,
+        ...     num_beams=num_beams,
+        ...     device=model.device,
+        ... )
 
-            >>> # instantiate logits processors
-            >>> logits_processor = LogitsProcessorList([
-            ...     MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
-            ... ])
+        >>> # instantiate logits processors
+        >>> logits_processor = LogitsProcessorList([
+        ...     MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
+        ... ])
 
-            >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
+        >>> outputs = model.beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
 
-            >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
-        """
+        >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+        ```"""
         # init values
         logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
         stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
@@ -2052,109 +2042,110 @@ class GenerationMixin:
 
         Parameters:
 
-            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                 The sequence used as a prompt for the generation.
-            beam_scorer (:obj:`BeamScorer`):
-                A derived instance of :class:`~transformers.BeamScorer` that defines how beam hypotheses are
+            beam_scorer (`BeamScorer`):
+                A derived instance of [`BeamScorer`] that defines how beam hypotheses are
                 constructed, stored and sorted during generation. For more information, the documentation of
-                :class:`~transformers.BeamScorer` should be read.
-            logits_processor (:obj:`LogitsProcessorList`, `optional`):
-                An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
-                :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling
+                [`BeamScorer`] should be read.
+            logits_processor (`LogitsProcessorList`, *optional*):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from
+                [`LogitsProcessor`] used to modify the prediction scores of the language modeling
                 head applied at each generation step.
-            stopping_criteria (:obj:`StoppingCriteriaList`, `optional`):
-                An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from
-                :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop.
-            logits_warper (:obj:`LogitsProcessorList`, `optional`):
-                An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
-                :class:`~transformers.LogitsWarper` used to warp the prediction score distribution of the language
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                An instance of [`StoppingCriteriaList`]. List of instances of class derived from
+                [`StoppingCriteria`] used to tell if the generation loop should stop.
+            logits_warper (`LogitsProcessorList`, *optional*):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from
+                [`LogitsWarper`] used to warp the prediction score distribution of the language
                 modeling head applied before multinomial sampling at each generation step.
-            max_length (:obj:`int`, `optional`, defaults to 20):
-                **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of
+            max_length (`int`, *optional*, defaults to 20):
+                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of
                 generated tokens. The maximum length of the sequence to be generated.
-            pad_token_id (:obj:`int`, `optional`):
-                The id of the `padding` token.
-            eos_token_id (:obj:`int`, `optional`):
-                The id of the `end-of-sequence` token.
-            output_attentions (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            eos_token_id (`int`, *optional*):
+                The id of the *end-of-sequence* token.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
-            return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
-            synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            output_scores (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+            synced_gpus (`bool`, *optional*, defaults to `False`):
                 Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
             model_kwargs:
-                Additional model specific kwargs will be forwarded to the :obj:`forward` function of the model. If
-                model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
+                Additional model specific kwargs will be forwarded to the `forward` function of the model. If
+                model is an encoder-decoder model the kwargs should include `encoder_outputs`.
 
         Return:
-            :class:`~transformers.generation_utils.BeamSampleDecoderOnlyOutput`,
-            :class:`~transformers.generation_utils.BeamSampleEncoderDecoderOutput` or obj:`torch.LongTensor`: A
-            :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a
-            :class:`~transformers.generation_utils.BeamSampleDecoderOnlyOutput` if
-            ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a
-            :class:`~transformers.generation_utils.BeamSampleEncoderDecoderOutput` if
-            ``model.config.is_encoder_decoder=True``.
+            [`~generation_utils.BeamSampleDecoderOnlyOutput`],
+            [`~generation_utils.BeamSampleEncoderDecoderOutput`] or `torch.LongTensor`: A
+            `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation_utils.BeamSampleDecoderOnlyOutput`] if
+            `model.config.is_encoder_decoder=False` and `return_dict_in_generate=True` or a
+            [`~generation_utils.BeamSampleEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
 
-        Examples::
+        Examples:
 
-            >>> from transformers import (
-            ...     AutoTokenizer,
-            ...     AutoModelForSeq2SeqLM,
-            ...     LogitsProcessorList,
-            ...     MinLengthLogitsProcessor,
-            ...     TopKLogitsWarper,
-            ...     TemperatureLogitsWarper,
-            ...     BeamSearchScorer,
-            ... )
-            >>> import torch
+        ```python
+        >>> from transformers import (
+        ...     AutoTokenizer,
+        ...     AutoModelForSeq2SeqLM,
+        ...     LogitsProcessorList,
+        ...     MinLengthLogitsProcessor,
+        ...     TopKLogitsWarper,
+        ...     TemperatureLogitsWarper,
+        ...     BeamSearchScorer,
+        ... )
+        >>> import torch
 
-            >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
-            >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
 
-            >>> encoder_input_str = "translate English to German: How old are you?"
-            >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
+        >>> encoder_input_str = "translate English to German: How old are you?"
+        >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
 
-            >>> # lets run beam search using 3 beams
-            >>> num_beams = 3
-            >>> # define decoder start token ids
-            >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
-            >>> input_ids = input_ids * model.config.decoder_start_token_id
+        >>> # lets run beam search using 3 beams
+        >>> num_beams = 3
+        >>> # define decoder start token ids
+        >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
+        >>> input_ids = input_ids * model.config.decoder_start_token_id
 
-            >>> # add encoder_outputs to model keyword arguments
-            >>> model_kwargs = {
-            ...     "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
-            ... }
+        >>> # add encoder_outputs to model keyword arguments
+        >>> model_kwargs = {
+        ...     "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
+        ... }
 
-            >>> # instantiate beam scorer
-            >>> beam_scorer = BeamSearchScorer(
-            ...     batch_size=1,
-            ...     max_length=model.config.max_length,
-            ...     num_beams=num_beams,
-            ...     device=model.device,
-            ... )
+        >>> # instantiate beam scorer
+        >>> beam_scorer = BeamSearchScorer(
+        ...     batch_size=1,
+        ...     max_length=model.config.max_length,
+        ...     num_beams=num_beams,
+        ...     device=model.device,
+        ... )
 
-            >>> # instantiate logits processors
-            >>> logits_processor = LogitsProcessorList([
-            ...     MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id)
-            ... ])
-            >>> # instantiate logits processors
-            >>> logits_warper = LogitsProcessorList([
-            ...     TopKLogitsWarper(50),
-            ...     TemperatureLogitsWarper(0.7),
-            ... ])
+        >>> # instantiate logits processors
+        >>> logits_processor = LogitsProcessorList([
+        ...     MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id)
+        ... ])
+        >>> # instantiate logits processors
+        >>> logits_warper = LogitsProcessorList([
+        ...     TopKLogitsWarper(50),
+        ...     TemperatureLogitsWarper(0.7),
+        ... ])
 
-            >>> outputs = model.beam_sample(
-            ...     input_ids, beam_scorer, logits_processor=logits_processor, logits_warper=logits_warper, **model_kwargs
-            ... )
+        >>> outputs = model.beam_sample(
+        ...     input_ids, beam_scorer, logits_processor=logits_processor, logits_warper=logits_warper, **model_kwargs
+        ... )
 
-            >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
-        """
+        >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+        ```"""
         # init values
         logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
         stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
@@ -2354,102 +2345,103 @@ class GenerationMixin:
 
         Parameters:
 
-            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                 The sequence used as a prompt for the generation.
-            beam_scorer (:obj:`BeamScorer`):
-                An derived instance of :class:`~transformers.BeamScorer` that defines how beam hypotheses are
+            beam_scorer (`BeamScorer`):
+                A derived instance of [`BeamScorer`] that defines how beam hypotheses are
                 constructed, stored and sorted during generation. For more information, the documentation of
-                :class:`~transformers.BeamScorer` should be read.
-            logits_processor (:obj:`LogitsProcessorList`, `optional`):
-                An instance of :class:`~transformers.LogitsProcessorList`. List of instances of class derived from
-                :class:`~transformers.LogitsProcessor` used to modify the prediction scores of the language modeling
+                [`BeamScorer`] should be read.
+            logits_processor (`LogitsProcessorList`, *optional*):
+                An instance of [`LogitsProcessorList`]. List of instances of class derived from
+                [`LogitsProcessor`] used to modify the prediction scores of the language modeling
                 head applied at each generation step.
-            stopping_criteria (:obj:`StoppingCriteriaList`, `optional`):
-                An instance of :class:`~transformers.StoppingCriteriaList`. List of instances of class derived from
-                :class:`~transformers.StoppingCriteria` used to tell if the generation loop should stop.
-            max_length (:obj:`int`, `optional`, defaults to 20):
-                **DEPRECATED**. Use :obj:`logits_processor` or :obj:`stopping_criteria` directly to cap the number of
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                An instance of [`StoppingCriteriaList`]. List of instances of class derived from
+                [`StoppingCriteria`] used to tell if the generation loop should stop.
+            max_length (`int`, *optional*, defaults to 20):
+                **DEPRECATED**. Use `logits_processor` or `stopping_criteria` directly to cap the number of
                 generated tokens. The maximum length of the sequence to be generated.
-            pad_token_id (:obj:`int`, `optional`):
-                The id of the `padding` token.
-            eos_token_id (:obj:`int`, `optional`):
-                The id of the `end-of-sequence` token.
-            output_attentions (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+            pad_token_id (`int`, *optional*):
+                The id of the *padding* token.
+            eos_token_id (`int`, *optional*):
+                The id of the *end-of-sequence* token.
+            output_attentions (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return the prediction scores. See ``scores`` under returned tensors for more details.
-            return_dict_in_generate (:obj:`bool`, `optional`, defaults to `False`):
-                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
-            synced_gpus (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            output_scores (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+            synced_gpus (`bool`, *optional*, defaults to `False`):
                 Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
 
             model_kwargs:
-                Additional model specific kwargs that will be forwarded to the :obj:`forward` function of the model. If
-                model is an encoder-decoder model the kwargs should include :obj:`encoder_outputs`.
+                Additional model specific kwargs that will be forwarded to the `forward` function of the model. If
+                model is an encoder-decoder model the kwargs should include `encoder_outputs`.
 
         Return:
-            :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput`,
-            :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` or obj:`torch.LongTensor`: A
-            :obj:`torch.LongTensor` containing the generated tokens (default behaviour) or a
-            :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if
-            :class:`~transformers.generation_utils.BeamSearchDecoderOnlyOutput` if
-            ``model.config.is_encoder_decoder=False`` and ``return_dict_in_generate=True`` or a
-            :class:`~transformers.generation_utils.BeamSearchEncoderDecoderOutput` if
-            ``model.config.is_encoder_decoder=True``.
+            [`~generation_utils.BeamSearchDecoderOnlyOutput`],
+            [`~generation_utils.BeamSearchEncoderDecoderOutput`] or `torch.LongTensor`: A
+            `torch.LongTensor` containing the generated tokens (default behaviour) or a
+            [`~generation_utils.BeamSearchDecoderOnlyOutput`] if
+            `model.config.is_encoder_decoder=False` and
+            `return_dict_in_generate=True` or a
+            [`~generation_utils.BeamSearchEncoderDecoderOutput`] if
+            `model.config.is_encoder_decoder=True`.
 
-        Examples::
+        Examples:
 
-            >>> from transformers import (
-            ...    AutoTokenizer,
-            ...    AutoModelForSeq2SeqLM,
-            ...    LogitsProcessorList,
-            ...    MinLengthLogitsProcessor,
-            ...    HammingDiversityLogitsProcessor,
-            ...    BeamSearchScorer,
-            ... )
-            >>> import torch
+        ```python
+        >>> from transformers import (
+        ...    AutoTokenizer,
+        ...    AutoModelForSeq2SeqLM,
+        ...    LogitsProcessorList,
+        ...    MinLengthLogitsProcessor,
+        ...    HammingDiversityLogitsProcessor,
+        ...    BeamSearchScorer,
+        ... )
+        >>> import torch
 
-            >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
-            >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
+        >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+        >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
 
-            >>> encoder_input_str = "translate English to German: How old are you?"
-            >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
+        >>> encoder_input_str = "translate English to German: How old are you?"
+        >>> encoder_input_ids = tokenizer(encoder_input_str, return_tensors="pt").input_ids
 
 
-            >>> # lets run diverse beam search using 6 beams
-            >>> num_beams = 6
-            >>> # define decoder start token ids
-            >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
-            >>> input_ids = input_ids * model.config.decoder_start_token_id
+        >>> # lets run diverse beam search using 6 beams
+        >>> num_beams = 6
+        >>> # define decoder start token ids
+        >>> input_ids = torch.ones((num_beams, 1), device=model.device, dtype=torch.long)
+        >>> input_ids = input_ids * model.config.decoder_start_token_id
 
-            >>> # add encoder_outputs to model keyword arguments
-            >>> model_kwargs = {
-            ...     "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
-            ... }
+        >>> # add encoder_outputs to model keyword arguments
+        >>> model_kwargs = {
+        ...     "encoder_outputs": model.get_encoder()(encoder_input_ids.repeat_interleave(num_beams, dim=0), return_dict=True)
+        ... }
 
-            >>> # instantiate beam scorer
-            >>> beam_scorer = BeamSearchScorer(
-            ...     batch_size=1,
-            ...     max_length=model.config.max_length,
-            ...     num_beams=num_beams,
-            ...     device=model.device,
-            ...     num_beam_groups=3
-            ... )
+        >>> # instantiate beam scorer
+        >>> beam_scorer = BeamSearchScorer(
+        ...     batch_size=1,
+        ...     max_length=model.config.max_length,
+        ...     num_beams=num_beams,
+        ...     device=model.device,
+        ...     num_beam_groups=3
+        ... )
 
-            >>> # instantiate logits processors
-            >>> logits_processor = LogitsProcessorList([
-            ...     HammingDiversityLogitsProcessor(5.5, num_beams=6, num_beam_groups=3),
-            ...     MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
-            ... ])
+        >>> # instantiate logits processors
+        >>> logits_processor = LogitsProcessorList([
+        ...     HammingDiversityLogitsProcessor(5.5, num_beams=6, num_beam_groups=3),
+        ...     MinLengthLogitsProcessor(5, eos_token_id=model.config.eos_token_id),
+        ... ])
 
-            >>> outputs = model.group_beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
+        >>> outputs = model.group_beam_search(input_ids, beam_scorer, logits_processor=logits_processor, **model_kwargs)
 
-            >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
-        """
+        >>> print("Generated:", tokenizer.batch_decode(outputs, skip_special_tokens=True))
+        ```"""
         # init values
         logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
         stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
@@ -2688,12 +2680,12 @@ def top_k_top_p_filtering(
 
     Args:
         logits: logits distribution shape (batch size, vocabulary size)
-        top_k (:obj:`int`, `optional`, defaults to 0):
+        top_k (`int`, *optional*, defaults to 0):
             If > 0, only keep the top k tokens with highest probability (top-k filtering)
-        top_p (:obj:`float`, `optional`, defaults to 1.0):
+        top_p (`float`, *optional*, defaults to 1.0):
             If < 1.0, only keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus
             filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
-        min_tokens_to_keep (:obj:`int`, `optional`, defaults to 1):
+        min_tokens_to_keep (`int`, *optional*, defaults to 1):
             Minimumber of tokens we keep per batch example in the output.
 
     From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
index d6cf5badbe..773f3e1cad 100644
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -41,14 +41,14 @@ def is_torch_tensor(obj):
 
 def load_image(image: Union[str, "PIL.Image.Image"]) -> "PIL.Image.Image":
     """
-    Loads :obj:`image` to a PIL Image.
+    Loads `image` to a PIL Image.
 
     Args:
-        image (:obj:`str` or :obj:`PIL.Image.Image`):
+        image (`str` or `PIL.Image.Image`):
             The image to convert to the PIL Image format.
 
     Returns:
-        :obj:`PIL.Image.Image`: A PIL Image.
+        `PIL.Image.Image`: A PIL Image.
     """
     if isinstance(image, str):
         if image.startswith("http://") or image.startswith("https://"):
@@ -87,15 +87,15 @@ class ImageFeatureExtractionMixin:
 
     def to_pil_image(self, image, rescale=None):
         """
-        Converts :obj:`image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last
+        Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last
         axis if needed.
 
         Args:
-            image (:obj:`PIL.Image.Image` or :obj:`numpy.ndarray` or :obj:`torch.Tensor`):
+            image (`PIL.Image.Image` or `numpy.ndarray` or `torch.Tensor`):
                 The image to convert to the PIL Image format.
-            rescale (:obj:`bool`, `optional`):
+            rescale (`bool`, *optional*):
                 Whether or not to apply the scaling factor (to make pixel values integers between 0 and 255). Will
-                default to :obj:`True` if the image type is a floating type, :obj:`False` otherwise.
+                default to `True` if the image type is a floating type, `False` otherwise.
         """
         self._ensure_format_supported(image)
 
@@ -117,17 +117,17 @@ class ImageFeatureExtractionMixin:
 
     def to_numpy_array(self, image, rescale=None, channel_first=True):
         """
-        Converts :obj:`image` to a numpy array. Optionally rescales it and puts the channel dimension as the first
+        Converts `image` to a numpy array. Optionally rescales it and puts the channel dimension as the first
         dimension.
 
         Args:
-            image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                 The image to convert to a NumPy array.
-            rescale (:obj:`bool`, `optional`):
+            rescale (`bool`, *optional*):
                 Whether or not to apply the scaling factor (to make pixel values floats between 0. and 1.). Will
-                default to :obj:`True` if the image is a PIL Image or an array/tensor of integers, :obj:`False`
+                default to `True` if the image is a PIL Image or an array/tensor of integers, `False`
                 otherwise.
-            channel_first (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            channel_first (`bool`, *optional*, defaults to `True`):
                 Whether or not to permute the dimensions of the image to put the channel dimension first.
         """
         self._ensure_format_supported(image)
@@ -151,15 +151,15 @@ class ImageFeatureExtractionMixin:
 
     def normalize(self, image, mean, std):
         """
-        Normalizes :obj:`image` with :obj:`mean` and :obj:`std`. Note that this will trigger a conversion of
-        :obj:`image` to a NumPy array if it's a PIL Image.
+        Normalizes `image` with `mean` and `std`. Note that this will trigger a conversion of
+        `image` to a NumPy array if it's a PIL Image.
 
         Args:
-            image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                 The image to normalize.
-            mean (:obj:`List[float]` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+            mean (`List[float]` or `np.ndarray` or `torch.Tensor`):
                 The mean (per channel) to use for normalization.
-            std (:obj:`List[float]` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+            std (`List[float]` or `np.ndarray` or `torch.Tensor`):
                 The standard deviation (per channel) to use for normalization.
         """
         self._ensure_format_supported(image)
@@ -187,14 +187,14 @@ class ImageFeatureExtractionMixin:
 
     def resize(self, image, size, resample=PIL.Image.BILINEAR):
         """
-        Resizes :obj:`image`. Note that this will trigger a conversion of :obj:`image` to a PIL Image.
+        Resizes `image`. Note that this will trigger a conversion of `image` to a PIL Image.
 
         Args:
-            image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                 The image to resize.
-            size (:obj:`int` or :obj:`Tuple[int, int]`):
+            size (`int` or `Tuple[int, int]`):
                 The size to use for resizing the image.
-            resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BILINEAR`):
+            resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
                 The filter to user for resampling.
         """
         self._ensure_format_supported(image)
@@ -210,13 +210,13 @@ class ImageFeatureExtractionMixin:
 
     def center_crop(self, image, size):
         """
-        Crops :obj:`image` to the given size using a center crop. Note that if the image is too small to be cropped to
+        Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to
         the size given, it will be padded (so the returned result has the size asked).
 
         Args:
-            image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                 The image to resize.
-            size (:obj:`int` or :obj:`Tuple[int, int]`):
+            size (`int` or `Tuple[int, int]`):
                 The size to which crop the image.
         """
         self._ensure_format_supported(image)
diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py
index 57bc0251fb..15ef465131 100644
--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -264,11 +264,11 @@ def run_hp_search_ray(trainer, n_trials: int, direction: str, **kwargs) -> BestR
     @functools.wraps(trainable)
     def dynamic_modules_import_trainable(*args, **kwargs):
         """
-        Wrapper around ``tune.with_parameters`` to ensure datasets_modules are loaded on each Actor.
+        Wrapper around `tune.with_parameters` to ensure datasets_modules are loaded on each Actor.
 
         Without this, an ImportError will be thrown. See https://github.com/huggingface/transformers/issues/11565.
 
-        Assumes that ``_objective``, defined above, is a function.
+        Assumes that `_objective`, defined above, is a function.
         """
         if is_datasets_available():
             import datasets.load
@@ -372,11 +372,10 @@ def rewrite_logs(d):
 
 class TensorBoardCallback(TrainerCallback):
     """
-    A :class:`~transformers.TrainerCallback` that sends the logs to `TensorBoard
-    <https://www.tensorflow.org/tensorboard>`__.
+    A [`TrainerCallback`] that sends the logs to [TensorBoard](https://www.tensorflow.org/tensorboard).
 
     Args:
-        tb_writer (:obj:`SummaryWriter`, `optional`):
+        tb_writer (`SummaryWriter`, *optional*):
             The writer to use. Will instantiate one if not set.
     """
 
@@ -461,7 +460,7 @@ class TensorBoardCallback(TrainerCallback):
 
 class WandbCallback(TrainerCallback):
     """
-    A :class:`~transformers.TrainerCallback` that sends the logs to `Weight and Biases <https://www.wandb.com/>`__.
+    A [`TrainerCallback`] that sends the logs to [Weights & Biases](https://www.wandb.com/).
     """
 
     def __init__(self):
@@ -478,22 +477,21 @@ class WandbCallback(TrainerCallback):
 
     def setup(self, args, state, model, **kwargs):
         """
-        Setup the optional Weights & Biases (`wandb`) integration.
+        Setup the optional Weights & Biases (*wandb*) integration.
 
-        One can subclass and override this method to customize the setup if needed. Find more information `here
-        <https://docs.wandb.ai/integrations/huggingface>`__. You can also override the following environment variables:
+        One can subclass and override this method to customize the setup if needed. Find more information [here](https://docs.wandb.ai/integrations/huggingface). You can also override the following environment variables:
 
         Environment:
-            WANDB_LOG_MODEL (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            WANDB_LOG_MODEL (`bool`, *optional*, defaults to `False`):
                 Whether or not to log model as artifact at the end of training. Use along with
-                `TrainingArguments.load_best_model_at_end` to upload best model.
-            WANDB_WATCH (:obj:`str`, `optional` defaults to :obj:`"gradients"`):
-                Can be :obj:`"gradients"`, :obj:`"all"` or :obj:`"false"`. Set to :obj:`"false"` to disable gradient
-                logging or :obj:`"all"` to log gradients and parameters.
-            WANDB_PROJECT (:obj:`str`, `optional`, defaults to :obj:`"huggingface"`):
+                *TrainingArguments.load_best_model_at_end* to upload best model.
+            WANDB_WATCH (`str`, *optional*, defaults to `"gradients"`):
+                Can be `"gradients"`, `"all"` or `"false"`. Set to `"false"` to disable gradient
+                logging or `"all"` to log gradients and parameters.
+            WANDB_PROJECT (`str`, *optional*, defaults to `"huggingface"`):
                 Set this to a custom string to store results in a different project.
-            WANDB_DISABLED (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not to disable wandb entirely. Set `WANDB_DISABLED=true` to disable.
+            WANDB_DISABLED (`bool`, *optional*, defaults to `False`):
+                Whether or not to disable wandb entirely. Set *WANDB_DISABLED=true* to disable.
         """
         if self._wandb is None:
             return
@@ -585,7 +583,7 @@ class WandbCallback(TrainerCallback):
 
 class CometCallback(TrainerCallback):
     """
-    A :class:`~transformers.TrainerCallback` that sends the logs to `Comet ML <https://www.comet.ml/site/>`__.
+    A [`TrainerCallback`] that sends the logs to [Comet ML](https://www.comet.ml/site/).
     """
 
     def __init__(self):
@@ -599,19 +597,18 @@ class CometCallback(TrainerCallback):
         Setup the optional Comet.ml integration.
 
         Environment:
-            COMET_MODE (:obj:`str`, `optional`):
+            COMET_MODE (`str`, *optional*):
                 Whether to create an online, offline experiment or disable Comet logging. Can be "OFFLINE", "ONLINE",
                 or "DISABLED". Defaults to "ONLINE".
-            COMET_PROJECT_NAME (:obj:`str`, `optional`):
+            COMET_PROJECT_NAME (`str`, *optional*):
                 Comet project name for experiments
-            COMET_OFFLINE_DIRECTORY (:obj:`str`, `optional`):
-                Folder to use for saving offline experiments when :obj:`COMET_MODE` is "OFFLINE"
-            COMET_LOG_ASSETS (:obj:`str`, `optional`):
+            COMET_OFFLINE_DIRECTORY (`str`, *optional*):
+                Folder to use for saving offline experiments when `COMET_MODE` is "OFFLINE"
+            COMET_LOG_ASSETS (`str`, *optional*):
                 Whether or not to log training assets (tf event logs, checkpoints, etc), to Comet. Can be "TRUE", or
                 "FALSE". Defaults to "TRUE".
 
-        For a number of configurable items in the environment, see `here
-        <https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables>`__.
+        For a number of configurable items in the environment, see [here](https://www.comet.ml/docs/python-sdk/advanced/#comet-configuration-variables).
         """
         self._initialized = True
         log_assets = os.getenv("COMET_LOG_ASSETS", "FALSE").upper()
@@ -661,8 +658,7 @@ class CometCallback(TrainerCallback):
 
 class AzureMLCallback(TrainerCallback):
     """
-    A :class:`~transformers.TrainerCallback` that sends the logs to `AzureML
-    <https://pypi.org/project/azureml-sdk/>`__.
+    A [`TrainerCallback`] that sends the logs to [AzureML](https://pypi.org/project/azureml-sdk/).
     """
 
     def __init__(self, azureml_run=None):
@@ -685,7 +681,7 @@ class AzureMLCallback(TrainerCallback):
 
 class MLflowCallback(TrainerCallback):
     """
-    A :class:`~transformers.TrainerCallback` that sends the logs to `MLflow <https://www.mlflow.org/>`__.
+    A [`TrainerCallback`] that sends the logs to [MLflow](https://www.mlflow.org/).
     """
 
     def __init__(self):
@@ -705,11 +701,11 @@ class MLflowCallback(TrainerCallback):
         Setup the optional MLflow integration.
 
         Environment:
-            HF_MLFLOW_LOG_ARTIFACTS (:obj:`str`, `optional`):
+            HF_MLFLOW_LOG_ARTIFACTS (`str`, *optional*):
                 Whether to use MLflow .log_artifact() facility to log artifacts.
 
-                This only makes sense if logging to a remote server, e.g. s3 or GCS. If set to `True` or `1`, will copy
-                whatever is in :class:`~transformers.TrainingArguments`'s ``output_dir`` to the local or remote
+                This only makes sense if logging to a remote server, e.g. s3 or GCS. If set to *True* or *1*, will copy
+                whatever is in [`TrainingArguments`]'s `output_dir` to the local or remote
                 artifact storage. Using it without a remote storage will just copy the files to your artifact location.
         """
         log_artifacts = os.getenv("HF_MLFLOW_LOG_ARTIFACTS", "FALSE").upper()
@@ -774,7 +770,7 @@ class MLflowCallback(TrainerCallback):
 
 class NeptuneCallback(TrainerCallback):
     """
-    A :class:`~transformers.TrainerCallback` that sends the logs to `Neptune <https://neptune.ai>`.
+    A [`TrainerCallback`] that sends the logs to [Neptune](https://neptune.ai).
     """
 
     def __init__(self):
@@ -793,13 +789,13 @@ class NeptuneCallback(TrainerCallback):
         Setup the Neptune integration.
 
         Environment:
-            NEPTUNE_PROJECT (:obj:`str`, `required`):
-                The project ID for neptune.ai account. Should be in format `workspace_name/project_name`
-            NEPTUNE_API_TOKEN (:obj:`str`, `required`):
+            NEPTUNE_PROJECT (`str`, *required*):
+                The project ID for neptune.ai account. Should be in format *workspace_name/project_name*
+            NEPTUNE_API_TOKEN (`str`, *required*):
                 API-token for neptune.ai account
-            NEPTUNE_CONNECTION_MODE (:obj:`str`, `optional`):
-                Neptune connection mode. `async` by default
-            NEPTUNE_RUN_NAME (:obj:`str`, `optional`):
+            NEPTUNE_CONNECTION_MODE (`str`, *optional*):
+                Neptune connection mode. *async* by default
+            NEPTUNE_RUN_NAME (`str`, *optional*):
                 The name of run process on Neptune dashboard
         """
         if state.is_world_process_zero:
@@ -831,7 +827,7 @@ class NeptuneCallback(TrainerCallback):
     def __del__(self):
         """
         Environment:
-            NEPTUNE_STOP_TIMEOUT (:obj:`int`, `optional`):
+            NEPTUNE_STOP_TIMEOUT (`int`, *optional*):
                 Number of seconsds to wait for all Neptune.ai tracking calls to finish, before stopping the tracked
                 run. If not set it will wait for all tracking calls to finish.
         """
@@ -845,7 +841,7 @@ class NeptuneCallback(TrainerCallback):
 
 class CodeCarbonCallback(TrainerCallback):
     """
-    A :class:`~transformers.TrainerCallback` that tracks the CO2 emission of training.
+    A [`TrainerCallback`] that tracks the CO2 emission of training.
     """
 
     def __init__(self):
diff --git a/src/transformers/keras_callbacks.py b/src/transformers/keras_callbacks.py
index ff1b938cec..670248524e 100644
--- a/src/transformers/keras_callbacks.py
+++ b/src/transformers/keras_callbacks.py
@@ -29,32 +29,32 @@ class PushToHubCallback(Callback):
         **model_card_args
     ):
         """
-        output_dir (:obj:`str`):
+        output_dir (`str`):
             The output directory where the model predictions and checkpoints will be written and synced with the
             repository on the Hub.
-        save_strategy (:obj:`str` or :class:`~transformers.trainer_utils.IntervalStrategy`, `optional`, defaults to :obj:`"epoch"`):
+        save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"epoch"`):
             The checkpoint save strategy to adopt during training. Possible values are:
 
-                * :obj:`"no"`: No save is done during training.
-                * :obj:`"epoch"`: Save is done at the end of each epoch.
-                * :obj:`"steps"`: Save is done every :obj:`save_steps`
-        save_steps (:obj:`int`, `optional`):
+                - `"no"`: No save is done during training.
+                - `"epoch"`: Save is done at the end of each epoch.
+                - `"steps"`: Save is done every `save_steps`
+        save_steps (`int`, *optional*):
             The number of steps between saves when using the "steps" save_strategy.
-        tokenizer (:obj:`PreTrainedTokenizerBase`, `optional`):
+        tokenizer (`PreTrainedTokenizerBase`, *optional*):
             The tokenizer used by the model. If supplied, will be uploaded to the repo alongside the weights.
-        hub_model_id (:obj:`str`, `optional`):
-            The name of the repository to keep in sync with the local `output_dir`. It can be a simple model ID in
+        hub_model_id (`str`, *optional*):
+            The name of the repository to keep in sync with the local *output_dir*. It can be a simple model ID in
             which case the model will be pushed in your namespace. Otherwise it should be the whole repository name,
-            for instance :obj:`"user_name/model"`, which allows you to push to an organization you are a member of with
-            :obj:`"organization_name/model"`.
+            for instance `"user_name/model"`, which allows you to push to an organization you are a member of with
+            `"organization_name/model"`.
 
-            Will default to to the name of :obj:`output_dir`.
-        hub_token (:obj:`str`, `optional`):
+            Will default to the name of `output_dir`.
+        hub_token (`str`, *optional*):
             The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with
-            :obj:`huggingface-cli login`.
-        checkpoint (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            `huggingface-cli login`.
+        checkpoint (`bool`, *optional*, defaults to `False`):
             Whether to save full training checkpoints (including epoch and optimizer state) to allow training to be
-            resumed. Only usable when `save_strategy` is `epoch`.
+            resumed. Only usable when *save_strategy* is *epoch*.
         """
         super().__init__()
         if checkpoint and save_strategy != "epoch":
diff --git a/src/transformers/modelcard.py b/src/transformers/modelcard.py
index dc663ca493..184bd80660 100644
--- a/src/transformers/modelcard.py
+++ b/src/transformers/modelcard.py
@@ -126,53 +126,53 @@ class ModelCard:
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         r"""
-        Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card.
+        Instantiate a [`ModelCard`] from a pre-trained model model card.
 
         Parameters:
             pretrained_model_name_or_path: either:
 
-                - a string, the `model id` of a pretrained model card hosted inside a model repo on huggingface.co.
-                  Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a
-                  user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a model card file saved using the
-                  :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``.
-                - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/modelcard.json``.
+                - a string, the *model id* of a pretrained model card hosted inside a model repo on huggingface.co.
+                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
+                  user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a model card file saved using the
+                  [`~ModelCard.save_pretrained`] method, e.g.: `./my_model_directory/`.
+                - a path or url to a saved model card JSON *file*, e.g.: `./my_model_directory/modelcard.json`.
 
-            cache_dir: (`optional`) string:
+            cache_dir: (*optional*) string:
                 Path to a directory in which a downloaded pre-trained model card should be cached if the standard cache
                 should not be used.
 
-            kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading.
+            kwargs: (*optional*) dict: key/value pairs with which to update the ModelCard object after loading.
 
                 - The values in kwargs of any keys which are model card attributes will be used to override the loaded
                   values.
                 - Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the
-                  `return_unused_kwargs` keyword parameter.
+                  *return_unused_kwargs* keyword parameter.
 
-            proxies: (`optional`) dict, default None:
+            proxies: (*optional*) dict, default None:
                 A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128',
                 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request.
 
-            find_from_standard_name: (`optional`) boolean, default True:
+            find_from_standard_name: (*optional*) boolean, default True:
                 If the pretrained_model_name_or_path ends with our standard model or config filenames, replace them
                 with our standard modelcard filename. Can be used to directly feed a model/config url and access the
                 colocated modelcard.
 
-            return_unused_kwargs: (`optional`) bool:
+            return_unused_kwargs: (*optional*) bool:
 
                 - If False, then this function returns just the final model card object.
-                - If True, then this functions returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a
+                - If True, then this function returns a tuple *(model card, unused_kwargs)* where *unused_kwargs* is a
                   dictionary consisting of the key/value pairs whose keys are not model card attributes: ie the part of
-                  kwargs which has not been used to update `ModelCard` and is otherwise ignored.
+                  kwargs which has not been used to update *ModelCard* and is otherwise ignored.
 
-        Examples::
+        Examples:
 
-            modelcard = ModelCard.from_pretrained('bert-base-uncased')    # Download model card from huggingface.co and cache.
-            modelcard = ModelCard.from_pretrained('./test/saved_model/')  # E.g. model card was saved using `save_pretrained('./test/saved_model/')`
-            modelcard = ModelCard.from_pretrained('./test/saved_model/modelcard.json')
-            modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
-
-        """
+        ```python
+        modelcard = ModelCard.from_pretrained('bert-base-uncased')    # Download model card from huggingface.co and cache.
+        modelcard = ModelCard.from_pretrained('./test/saved_model/')  # E.g. model card was saved using `save_pretrained('./test/saved_model/')`
+        modelcard = ModelCard.from_pretrained('./test/saved_model/modelcard.json')
+        modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
+        ```"""
         # This imports every model so let's do it dynamically here.
         from transformers.models.auto.configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
 
diff --git a/src/transformers/modeling_flax_pytorch_utils.py b/src/transformers/modeling_flax_pytorch_utils.py
index c822f11e98..100e032a38 100644
--- a/src/transformers/modeling_flax_pytorch_utils.py
+++ b/src/transformers/modeling_flax_pytorch_utils.py
@@ -69,7 +69,7 @@ def rename_key_and_reshape_tensor(
     """Rename PT weight names to corresponding Flax weight names and reshape tensor if necessary"""
 
     def is_key_or_prefix_key_in_dict(key: Tuple[str]) -> bool:
-        """Checks if ``key`` of ``(prefix,) + key`` is in random_flax_state_dict"""
+        """Checks if `key` or `(prefix,) + key` is in random_flax_state_dict"""
         return len(set(random_flax_state_dict) & set([key, (model_prefix,) + key])) > 0
 
     # layer norm
diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index 2be53474c3..945349f361 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -67,17 +67,17 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
     r"""
     Base class for all models.
 
-    :class:`~transformers.FlaxPreTrainedModel` takes care of storing the configuration of the models and handles
+    [`FlaxPreTrainedModel`] takes care of storing the configuration of the models and handles
     methods for loading, downloading and saving models.
 
     Class attributes (overridden by derived classes):
 
-        - **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of
-          :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
-        - **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
+        - **config_class** ([`PretrainedConfig`]) -- A subclass of
+          [`PretrainedConfig`] to use as configuration class for this model architecture.
+        - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in
           derived classes of the same architecture adding modules on top of the base model.
-        - **main_input_name** (:obj:`str`) -- The name of the principal input to the model (often :obj:`input_ids` for
-          NLP models, :obj:`pixel_values` for vision models and :obj:`input_values` for speech models).
+        - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for
+          NLP models, `pixel_values` for vision models and `input_values` for speech models).
     """
     config_class = None
     base_model_prefix = ""
@@ -159,7 +159,7 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
 
     def _cast_floating_to(self, params: Union[Dict, FrozenDict], dtype: jnp.dtype, mask: Any = None) -> Any:
         """
-        Helper method to cast floating-point values of given parameter ``PyTree`` to given ``dtype``.
+        Helper method to cast floating-point values of given parameter `PyTree` to given `dtype`.
         """
 
         # taken from https://github.com/deepmind/jmp/blob/3a8318abc3292be38582794dbf7b094e6583b192/jmp/_src/policy.py#L27
@@ -183,94 +183,97 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
 
     def to_bf16(self, params: Union[Dict, FrozenDict], mask: Any = None):
         r"""
-        Cast the floating-point ``params`` to ``jax.numpy.bfloat16``. This returns a new ``params`` tree and does not
-        cast the ``params`` in place.
+        Cast the floating-point `params` to `jax.numpy.bfloat16`. This returns a new `params` tree and does not
+        cast the `params` in place.
 
         This method can be used on TPU to explicitly convert the model parameters to bfloat16 precision to do full
         half-precision training or to save weights in bfloat16 for inference in order to save memory and improve speed.
 
         Arguments:
-            params (:obj:`Union[Dict, FrozenDict]`):
-                A ``PyTree`` of model parameters.
-            mask (:obj:`Union[Dict, FrozenDict]`):
-                A ``PyTree`` with same structure as the ``params`` tree. The leaves should be booleans, :obj:`True` for
-                params you want to cast, and should be :obj:`False` for those you want to skip.
+            params (`Union[Dict, FrozenDict]`):
+                A `PyTree` of model parameters.
+            mask (`Union[Dict, FrozenDict]`):
+                A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for
+                params you want to cast, and should be `False` for those you want to skip.
 
-        Examples::
+        Examples:
 
-            >>> from transformers import FlaxBertModel
-            >>> # load model
-            >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
-            >>> # By default, the model parameters will be in fp32 precision, to cast these to bfloat16 precision
-            >>> model.params = model.to_bf16(model.params)
-            >>> # If you want don't want to cast certain parameters (for example layer norm bias and scale)
-            >>> # then pass the mask as follows
-            >>> from flax import traverse_util
-            >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
-            >>> flat_params = traverse_util.flatten_dict(model.params)
-            >>> mask = {path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
-            >>> mask = traverse_util.unflatten_dict(mask)
-            >>> model.params = model.to_bf16(model.params, mask)
-        """
+        ```python
+        >>> from transformers import FlaxBertModel
+        >>> # load model
+        >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+        >>> # By default, the model parameters will be in fp32 precision, to cast these to bfloat16 precision
+        >>> model.params = model.to_bf16(model.params)
+        >>> # If you don't want to cast certain parameters (for example layer norm bias and scale)
+        >>> # then pass the mask as follows
+        >>> from flax import traverse_util
+        >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+        >>> flat_params = traverse_util.flatten_dict(model.params)
+        >>> mask = {path: (path[-2:] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+        >>> mask = traverse_util.unflatten_dict(mask)
+        >>> model.params = model.to_bf16(model.params, mask)
+        ```"""
         return self._cast_floating_to(params, jnp.bfloat16, mask)
 
     def to_fp32(self, params: Union[Dict, FrozenDict], mask: Any = None):
         r"""
-        Cast the floating-point ``parmas`` to ``jax.numpy.float32``. This method can be used to explicitly convert the
-        model parameters to fp32 precision. This returns a new ``params`` tree and does not cast the ``params`` in
+        Cast the floating-point `params` to `jax.numpy.float32`. This method can be used to explicitly convert the
+        model parameters to fp32 precision. This returns a new `params` tree and does not cast the `params` in
         place.
 
         Arguments:
-            params (:obj:`Union[Dict, FrozenDict]`):
-                A ``PyTree`` of model parameters.
-            mask (:obj:`Union[Dict, FrozenDict]`):
-                A ``PyTree`` with same structure as the ``params`` tree. The leaves should be booleans, :obj:`True` for
-                params you want to cast, and should be :obj:`False` for those you want to skip
+            params (`Union[Dict, FrozenDict]`):
+                A `PyTree` of model parameters.
+            mask (`Union[Dict, FrozenDict]`):
+                A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for
+                params you want to cast, and should be `False` for those you want to skip
 
-        Examples::
+        Examples:
 
-            >>> from transformers import FlaxBertModel
-            >>> # Download model and configuration from huggingface.co
-            >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
-            >>> # By default, the model params will be in fp32, to illustrate the use of this method,
-            >>> # we'll first cast to fp16 and back to fp32
-            >>> model.params = model.to_f16(model.params)
-            >>> # now cast back to fp32
-            >>> model.params = model.to_fp32(model.params)
-        """
+        ```python
+        >>> from transformers import FlaxBertModel
+        >>> # Download model and configuration from huggingface.co
+        >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+        >>> # By default, the model params will be in fp32, to illustrate the use of this method,
+        >>> # we'll first cast to fp16 and back to fp32
+        >>> model.params = model.to_fp16(model.params)
+        >>> # now cast back to fp32
+        >>> model.params = model.to_fp32(model.params)
+        ```"""
         return self._cast_floating_to(params, jnp.float32, mask)
 
     def to_fp16(self, params: Union[Dict, FrozenDict], mask: Any = None):
         r"""
-        Cast the floating-point ``parmas`` to ``jax.numpy.float16``. This returns a new ``params`` tree and does not
-        cast the ``params`` in place.
+        Cast the floating-point `params` to `jax.numpy.float16`. This returns a new `params` tree and does not
+        cast the `params` in place.
 
         This method can be used on GPU to explicitly convert the model parameters to float16 precision to do full
         half-precision training or to save weights in float16 for inference in order to save memory and improve speed.
 
         Arguments:
-            params (:obj:`Union[Dict, FrozenDict]`):
-                A ``PyTree`` of model parameters.
-            mask (:obj:`Union[Dict, FrozenDict]`):
-                A ``PyTree`` with same structure as the ``params`` tree. The leaves should be booleans, :obj:`True` for
-                params you want to cast, and should be :obj:`False` for those you want to skip
+            params (`Union[Dict, FrozenDict]`):
+                A `PyTree` of model parameters.
+            mask (`Union[Dict, FrozenDict]`):
+                A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for
+                params you want to cast, and should be `False` for those you want to skip
 
-        Examples::
+        Examples:
 
-            >>> from transformers import FlaxBertModel
-            >>> # load model
-            >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
-            >>> # By default, the model params will be in fp32, to cast these to float16
-            >>> model.params = model.to_fp16(model.params)
-            >>> # If you want don't want to cast certain parameters (for example layer norm bias and scale)
-            >>> # then pass the mask as follows
-            >>> from flax import traverse_util
-            >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
-            >>> flat_params = traverse_util.flatten_dict(model.params)
-            >>> mask = {path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
-            >>> mask = traverse_util.unflatten_dict(mask)
-            >>> model.params = model.to_fp16(model.params, mask)
-        """
+        ```python
+        >>> from transformers import FlaxBertModel
+        >>> # load model
+        >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+        >>> # By default, the model params will be in fp32, to cast these to float16
+        >>> model.params = model.to_fp16(model.params)
+        >>> # If you don't want to cast certain parameters (for example layer norm bias and scale)
+        >>> # then pass the mask as follows
+        >>> from flax import traverse_util
+        >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+        >>> flat_params = traverse_util.flatten_dict(model.params)
+        >>> mask = {path: (path[-2:] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+        >>> mask = traverse_util.unflatten_dict(mask)
+        >>> model.params = model.to_fp16(model.params, mask)
+        ```"""
         return self._cast_floating_to(params, jnp.float16, mask)
 
     @classmethod
@@ -285,104 +288,104 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
         r"""
         Instantiate a pretrained flax model from a pre-trained model configuration.
 
-        The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come
+        The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
         pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
         task.
 
-        The warning `Weights from XXX not used in YYY` means that the layer XXX is not used by YYY, therefore those
+        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
         weights are discarded.
 
         Parameters:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 Can be either:
 
-                    - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
-                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                    - A path to a `directory` containing model weights saved using
-                      :func:`~transformers.FlaxPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
-                    - A path or url to a `pt index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In this
-                      case, ``from_pt`` should be set to :obj:`True`.
-            dtype (:obj:`jax.numpy.dtype`, `optional`, defaults to :obj:`jax.numpy.float32`):
-                The data type of the computation. Can be one of :obj:`jax.numpy.float32`, :obj:`jax.numpy.float16` (on
-                GPUs) and :obj:`jax.numpy.bfloat16` (on TPUs).
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+                      a user or organization name, like `dbmdz/bert-base-german-cased`.
+                    - A path to a *directory* containing model weights saved using
+                      [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+                    - A path or url to a *pt index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In this
+                      case, `from_pt` should be set to `True`.
+            dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+                The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on
+                GPUs) and `jax.numpy.bfloat16` (on TPUs).
 
                 This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
-                specified all the computation will be performed with the given ``dtype``.
+                specified all the computation will be performed with the given `dtype`.
 
                 **Note that this only specifies the dtype of the computation and does not influence the dtype of model
                 parameters.**
 
                 If you wish to change the dtype of the model parameters, see
-                :meth:`~transformers.FlaxPreTrainedModel.to_fp16` and
-                :meth:`~transformers.FlaxPreTrainedModel.to_bf16`.
-            model_args (sequence of positional arguments, `optional`):
-                All remaining positional arguments will be passed to the underlying model's ``__init__`` method.
-            config (:obj:`Union[PretrainedConfig, str, os.PathLike]`, `optional`):
+                [`~FlaxPreTrainedModel.to_fp16`] and
+                [`~FlaxPreTrainedModel.to_bf16`].
+            model_args (sequence of positional arguments, *optional*):
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
+            config (`Union[PretrainedConfig, str, os.PathLike]`, *optional*):
                 Can be either:
 
-                    - an instance of a class derived from :class:`~transformers.PretrainedConfig`,
-                    - a string or path valid as input to :func:`~transformers.PretrainedConfig.from_pretrained`.
+                    - an instance of a class derived from [`PretrainedConfig`],
+                    - a string or path valid as input to [`~PretrainedConfig.from_pretrained`].
 
                 Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                 be automatically loaded when:
 
-                    - The model is a model provided by the library (loaded with the `model id` string of a pretrained
+                    - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                       model).
-                    - The model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
+                    - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded
                       by supplying the save directory.
-                    - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
-                      configuration JSON file named `config.json` is found in the directory.
-            cache_dir (:obj:`Union[str, os.PathLike]`, `optional`):
+                    - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
+                      configuration JSON file named *config.json* is found in the directory.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
                 Path to a directory in which a downloaded pretrained model configuration should be cached if the
                 standard cache should not be used.
-            from_pt (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            from_pt (`bool`, *optional*, defaults to `False`):
                 Load the model weights from a PyTorch checkpoint save file (see docstring of
-                ``pretrained_model_name_or_path`` argument).
-            ignore_mismatched_sizes (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                `pretrained_model_name_or_path` argument).
+            ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
                 Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
                 as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
                 checkpoint with 3 labels).
-            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                 cached versions if they exist.
-            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            resume_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                 file exists.
-            proxies (:obj:`Dict[str, str]`, `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            local_files_only (`bool`, *optional*, defaults to `False`):
                 Whether or not to only look at local files (i.e., do not try to download the model).
-            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            revision (`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                 identifier allowed by git.
-            kwargs (remaining dictionary of keyword arguments, `optional`):
+            kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
-                :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
+                `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
                 automatically loaded:
 
-                    - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
-                      underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
+                    - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
+                      underlying model's `__init__` method (we assume all relevant updates to the configuration have
                       already been done)
-                    - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
-                      initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
-                      ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
-                      with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
-                      attribute will be passed to the underlying model's ``__init__`` function.
+                    - If a configuration is not provided, `kwargs` will be first passed to the configuration class
+                      initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of
+                      `kwargs` that corresponds to a configuration attribute will be used to override said attribute
+                      with the supplied `kwargs` value. Remaining keys that do not correspond to any configuration
+                      attribute will be passed to the underlying model's `__init__` function.
 
-        Examples::
+        Examples:
 
-            >>> from transformers import BertConfig, FlaxBertModel
-            >>> # Download model and configuration from huggingface.co and cache.
-            >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
-            >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
-            >>> model = FlaxBertModel.from_pretrained('./test/saved_model/')
-            >>> # Loading from a PyTorch checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
-            >>> config = BertConfig.from_json_file('./pt_model/config.json')
-            >>> model = FlaxBertModel.from_pretrained('./pt_model/pytorch_model.bin', from_pt=True, config=config)
-        """
+        ```python
+        >>> from transformers import BertConfig, FlaxBertModel
+        >>> # Download model and configuration from huggingface.co and cache.
+        >>> model = FlaxBertModel.from_pretrained('bert-base-cased')
+        >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
+        >>> model = FlaxBertModel.from_pretrained('./test/saved_model/')
+        >>> # Loading from a PyTorch checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
+        >>> config = BertConfig.from_json_file('./pt_model/config.json')
+        >>> model = FlaxBertModel.from_pretrained('./pt_model/pytorch_model.bin', from_pt=True, config=config)
+        ```"""
         config = kwargs.pop("config", None)
         cache_dir = kwargs.pop("cache_dir", None)
         from_pt = kwargs.pop("from_pt", False)
@@ -592,24 +595,26 @@ class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin):
     def save_pretrained(self, save_directory: Union[str, os.PathLike], params=None, push_to_hub=False, **kwargs):
         """
         Save a model and its configuration file to a directory, so that it can be re-loaded using the
-        `:func:`~transformers.FlaxPreTrainedModel.from_pretrained`` class method
+        [`~FlaxPreTrainedModel.from_pretrained`] class method
 
         Arguments:
-            save_directory (:obj:`str` or :obj:`os.PathLike`):
+            save_directory (`str` or `os.PathLike`):
                 Directory to which to save. Will be created if it doesn't exist.
-            push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            push_to_hub (`bool`, *optional*, defaults to `False`):
                 Whether or not to push your model to the Hugging Face model hub after saving it.
 
-                .. warning::
+                <Tip warning={true}>
 
-                    Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with
-                    :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are
-                    pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory
-                    instead.
+                Using `push_to_hub=True` will synchronize the repository you are pushing to with
+                `save_directory`, which requires `save_directory` to be a local clone of the repo you are
+                pushing to if it's an existing folder. Pass along `temp_dir=True` to use a temporary directory
+                instead.
+
+                </Tip>
 
             kwargs:
                 Additional key word arguments passed along to the
-                :meth:`~transformers.file_utils.PushToHubMixin.push_to_hub` method.
+                [`~file_utils.PushToHubMixin.push_to_hub`] method.
         """
         if os.path.isfile(save_directory):
             logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index bb66e3f62f..b562686b68 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -74,7 +74,7 @@ def dummy_loss(y_true, y_pred):
 
 class TFModelUtilsMixin:
     """
-    A few utilities for :obj:`tf.keras.Model`, to be used as a mixin.
+    A few utilities for `tf.keras.Model`, to be used as a mixin.
     """
 
     def num_parameters(self, only_trainable: bool = False) -> int:
@@ -82,11 +82,11 @@ class TFModelUtilsMixin:
         Get the number of (optionally, trainable) parameters in the model.
 
         Args:
-            only_trainable (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            only_trainable (`bool`, *optional*, defaults to `False`):
                 Whether or not to return only the number of trainable parameters
 
         Returns:
-            :obj:`int`: The number of parameters.
+            `int`: The number of parameters.
         """
         if only_trainable:
             return int(sum(np.prod(w.shape.as_list()) for w in self.trainable_variables))
@@ -100,16 +100,16 @@ def keras_serializable(cls):
 
     This is done by:
 
-    1. Adding a :obj:`transformers_config` dict to the Keras config dictionary in :obj:`get_config` (called by Keras at
+    1. Adding a `transformers_config` dict to the Keras config dictionary in `get_config` (called by Keras at
        serialization time.
-    2. Wrapping :obj:`__init__` to accept that :obj:`transformers_config` dict (passed by Keras at deserialization
+    2. Wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization
        time) and convert it to a config object for the actual layer initializer.
     3. Registering the class as a custom object in Keras (if the Tensorflow version supports this), so that it does not
-       need to be supplied in :obj:`custom_objects` in the call to :obj:`tf.keras.models.load_model`.
+       need to be supplied in `custom_objects` in the call to `tf.keras.models.load_model`.
 
     Args:
-        cls (a :obj:`tf.keras.layers.Layers subclass`):
-            Typically a :obj:`TF.MainLayer` class in this project, in general must accept a :obj:`config` argument to
+        cls (a `tf.keras.layers.Layer` subclass):
+            Typically a `TF.MainLayer` class in this project, in general must accept a `config` argument to
             its initializer.
 
     Returns:
@@ -163,10 +163,11 @@ class TFCausalLanguageModelingLoss:
     """
     Loss function suitable for causal language modeling (CLM), that is, the task of guessing the next token.
 
-    .. note::
+    <Tip>
 
-        Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+    Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
 
+    </Tip>
     """
 
     def compute_loss(self, labels, logits):
@@ -199,10 +200,11 @@ class TFTokenClassificationLoss:
     """
     Loss function suitable for token classification.
 
-    .. note::
+    <Tip>
 
-        Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+    Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
 
+    </Tip>
     """
 
     def compute_loss(self, labels, logits):
@@ -252,9 +254,11 @@ class TFMaskedLanguageModelingLoss(TFCausalLanguageModelingLoss):
     """
     Loss function suitable for masked language modeling (MLM), that is, the task of guessing the masked tokens.
 
-    .. note::
+    <Tip>
 
-         Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+    Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+
+    </Tip>
     """
 
 
@@ -262,8 +266,11 @@ class TFNextSentencePredictionLoss:
     """
     Loss function suitable for next sentence prediction (NSP), that is, the task of guessing the next sentence.
 
-    .. note::
-         Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+    <Tip>
+
+    Any label of -100 will be ignored (along with the corresponding logits) in the loss computation.
+
+    </Tip>
     """
 
     def compute_loss(self, labels, logits):
@@ -285,7 +292,7 @@ def booleans_processing(config, **kwargs):
     graph)
 
     Args:
-        config (:class:`~transformers.PretrainedConfig`):
+        config ([`PretrainedConfig`]):
             The config of the running model.
         **kwargs:
             The boolean parameters
@@ -345,9 +352,9 @@ def input_processing(func, config, input_ids, **kwargs):
     name="input_ids")` otherwise the order of the tensors will not be guaranteed during the training.
 
     Args:
-        func (:obj:`callable`):
+        func (`callable`):
             The callable function of the TensorFlow model.
-        config (:class:`~transformers.PretrainedConfig`):
+        config ([`PretrainedConfig`]):
             The config of the running model.
         **kwargs:
             The inputs of the model.
@@ -491,11 +498,11 @@ def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False,
     Detect missing and unexpected layers and load the TF weights accordingly to their names and shapes.
 
     Args:
-        model (:obj:`tf.keras.models.Model`):
+        model (`tf.keras.models.Model`):
             The model to load the weights into.
-        resolved_archive_file (:obj:`str`):
+        resolved_archive_file (`str`):
             The location of the H5 file.
-        ignore_mismatched_sizes (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
             Whether or not to ignore weights with shapes that don't match between the checkpoint of the model.
 
     Returns:
@@ -641,20 +648,20 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
     r"""
     Base class for all TF models.
 
-    :class:`~transformers.TFPreTrainedModel` takes care of storing the configuration of the models and handles methods
+    [`TFPreTrainedModel`] takes care of storing the configuration of the models and handles methods
     for loading, downloading and saving models as well as a few methods common to all models to:
 
-        * resize the input embeddings,
-        * prune heads in the self-attention heads.
+        - resize the input embeddings,
+        - prune heads in the self-attention heads.
 
     Class attributes (overridden by derived classes):
 
-        - **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of
-          :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
-        - **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
+        - **config_class** ([`PretrainedConfig`]) -- A subclass of
+          [`PretrainedConfig`] to use as configuration class for this model architecture.
+        - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in
           derived classes of the same architecture adding modules on top of the base model.
-        - **main_input_name** (:obj:`str`) -- The name of the principal input to the model (often :obj:`input_ids` for
-          NLP models, :obj:`pixel_values` for vision models and :obj:`input_values` for speech models).
+        - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for
+          NLP models, `pixel_values` for vision models and `input_values` for speech models).
     """
     config_class = None
     base_model_prefix = ""
@@ -674,7 +681,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         Dummy inputs to build the network.
 
         Returns:
-            :obj:`Dict[str, tf.Tensor]`: The dummy inputs.
+            `Dict[str, tf.Tensor]`: The dummy inputs.
         """
         return {
             "input_ids": tf.constant(DUMMY_INPUTS),
@@ -729,7 +736,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         Method used for serving the model.
 
         Args:
-            inputs (:obj:`Dict[str, tf.Tensor]`):
+            inputs (`Dict[str, tf.Tensor]`):
                 The input of the saved model as a dictionary of tensors.
         """
         output = self.call(inputs)
@@ -741,7 +748,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         Prepare the output of the saved model. Each model must implement this function.
 
         Args:
-            output (:class:`~transformers.TFBaseModelOutput`):
+            output ([`TFBaseModelOutput`]):
                 The output returned by the model.
         """
         raise NotImplementedError
@@ -751,7 +758,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         Returns the model's input embeddings layer.
 
         Returns:
-            :obj:`tf.Variable`: The embeddings layer mapping vocabulary to hidden states.
+            `tf.Variable`: The embeddings layer mapping vocabulary to hidden states.
         """
         main_layer = getattr(self, self.base_model_prefix, self)
 
@@ -779,12 +786,12 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         the checkpoint was made.
 
         Args:
-            repo_path_or_name (:obj:`str`):
+            repo_path_or_name (`str`):
                 Can either be a repository name for your {object} in the Hub or a path to a local folder (in which case
                 the repository will have the name of that local folder).
 
         Returns:
-            :obj:`dict`: A dictionary of extra metadata from the checkpoint, most commonly an "epoch" count.
+            `dict`: A dictionary of extra metadata from the checkpoint, most commonly an "epoch" count.
         """
         if getattr(self, "optimizer", None) is None:
             raise RuntimeError(
@@ -971,7 +978,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         Set model's input embeddings
 
         Args:
-            value (:obj:`tf.Variable`):
+            value (`tf.Variable`):
                 The new weights mapping hidden states to vocabulary.
         """
         main_layer = getattr(self, self.base_model_prefix)
@@ -991,7 +998,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         Returns the model's output embeddings
 
         Returns:
-            :obj:`tf.Variable`: The new weights mapping vocabulary to hidden states.
+            `tf.Variable`: The new weights mapping vocabulary to hidden states.
         """
         if self.get_lm_head() is not None:
             lm_head = self.get_lm_head()
@@ -1011,7 +1018,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         Set model's output embeddings
 
         Args:
-            value (:obj:`tf.Variable`):
+            value (`tf.Variable`):
                 The new weights mapping hidden states to vocabulary.
         """
         if self.get_lm_head() is not None:
@@ -1029,7 +1036,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         embeddings
 
         Return:
-            :obj:`tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
+            `tf.keras.layers.Layer`: The layer that handles the bias, None if not an LM model.
         """
         warnings.warn(
             "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning
@@ -1041,7 +1048,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         Get the concatenated _prefix name of the bias from the model name to the parent layer
 
         Return:
-            :obj:`str`: The _prefix name of the bias.
+            `str`: The _prefix name of the bias.
         """
         warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning)
         return None
@@ -1051,7 +1058,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         Dict of bias attached to an LM head. The key represents the name of the bias attribute.
 
         Return:
-            :obj:`tf.Variable`: The weights representing the bias, None if not an LM model.
+            `tf.Variable`: The weights representing the bias, None if not an LM model.
         """
         if self.get_lm_head() is not None:
             lm_head = self.get_lm_head()
@@ -1068,7 +1075,7 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         Set all the bias in the LM head.
 
         Args:
-            value (:obj:`Dict[tf.Variable]`):
+            value (`Dict[tf.Variable]`):
                 All the new bias attached to an LM head.
         """
         if self.get_lm_head() is not None:
@@ -1084,25 +1091,25 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         The LM Head layer. This method must be overwritten by all the models that have a lm head.
 
         Return:
-            :obj:`tf.keras.layers.Layer`: The LM head layer if the model has one, None if not.
+            `tf.keras.layers.Layer`: The LM head layer if the model has one, None if not.
         """
         return None
 
     def resize_token_embeddings(self, new_num_tokens=None) -> tf.Variable:
         """
-        Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`.
+        Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.
 
-        Takes care of tying weights embeddings afterwards if the model class has a :obj:`tie_weights()` method.
+        Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
 
         Arguments:
-            new_num_tokens (:obj:`int`, `optional`):
+            new_num_tokens (`int`, *optional*):
                 The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
-                vectors at the end. Reducing the size will remove vectors from the end. If not provided or :obj:`None`,
-                just returns a pointer to the input tokens :obj:`tf.Variable` module of the model without doing
+                vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`,
+                just returns a pointer to the input tokens `tf.Variable` module of the model without doing
                 anything.
 
         Return:
-            :obj:`tf.Variable`: Pointer to the input tokens Embeddings Module of the model.
+            `tf.Variable`: Pointer to the input tokens Embeddings Module of the model.
         """
         if new_num_tokens is None or new_num_tokens == self.config.vocab_size:
             return self._get_word_embedding_weight(self.get_input_embeddings())
@@ -1166,16 +1173,16 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         Reducing the size will remove vectors from the end
 
         Args:
-            old_lm_head_bias (:obj:`tf.Variable`):
+            old_lm_head_bias (`tf.Variable`):
                 Old lm head bias to be resized.
-            new_num_tokens (:obj:`int`, `optional`):
+            new_num_tokens (`int`, *optional*):
                 New number of tokens in the linear matrix.
 
                 Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
-                vectors from the end. If not provided or :obj:`None`, just returns None
+                vectors from the end. If not provided or `None`, just returns None
 
         Return:
-            :obj:`tf.Variable`: Pointer to the resized bias.
+            `tf.Variable`: Pointer to the resized bias.
         """
         new_lm_head_bias = {}
 
@@ -1218,16 +1225,16 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         Reducing the size will remove vectors from the end
 
         Args:
-            old_lm_head_decoder (:obj:`tf.Variable`):
+            old_lm_head_decoder (`tf.Variable`):
                 Old lm head decoder to be resized.
-            new_num_tokens (:obj:`int`, `optional`):
+            new_num_tokens (`int`, *optional*):
                 New number of tokens in the linear matrix.
 
                 Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
-                vectors from the end. If not provided or :obj:`None`, just returns None
+                vectors from the end. If not provided or `None`, just returns None
 
         Return:
-            :obj:`tf.Variable`: Pointer to the resized decoder or None if the output embeddings are different from the
+            `tf.Variable`: Pointer to the resized decoder or None if the output embeddings are different from the
             input ones.
         """
         new_lm_head_decoder = old_lm_head_decoder
@@ -1256,18 +1263,18 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         initialized vectors at the end. Reducing the size will remove vectors from the end
 
         Args:
-            old_embeddings (:obj:`tf.Variable`):
+            old_embeddings (`tf.Variable`):
                 Old embeddings to be resized.
-            new_num_tokens (:obj:`int`, `optional`):
+            new_num_tokens (`int`, *optional*):
                 New number of tokens in the embedding matrix.
 
                 Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
-                vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens
-                :obj:`tf.Variable`` module of the model without doing anything.
+                vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
+                `tf.Variable` module of the model without doing anything.
 
         Return:
-            :obj:`tf.Variable`: Pointer to the resized Embedding Module or the old Embedding Module if
-            :obj:`new_num_tokens` is :obj:`None`
+            `tf.Variable`: Pointer to the resized Embedding Module or the old Embedding Module if
+            `new_num_tokens` is `None`
         """
         old_embedding_dim = shape_list(old_embeddings)[1]
         init_range = getattr(self.config, "initializer_range", 0.02)
@@ -1289,9 +1296,9 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         Prunes heads of the base model.
 
         Arguments:
-            heads_to_prune (:obj:`Dict[int, List[int]]`):
-                Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list of
-                heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads
+            heads_to_prune (`Dict[int, List[int]]`):
+                Dictionary with keys being selected layer indices (`int`) and associated values being the list of
+                heads to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads
                 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
         """
         raise NotImplementedError
@@ -1299,30 +1306,32 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
     def save_pretrained(self, save_directory, saved_model=False, version=1, push_to_hub=False, **kwargs):
         """
         Save a model and its configuration file to a directory, so that it can be re-loaded using the
-        :func:`~transformers.TFPreTrainedModel.from_pretrained` class method.
+        [`~TFPreTrainedModel.from_pretrained`] class method.
 
         Arguments:
-            save_directory (:obj:`str`):
+            save_directory (`str`):
                 Directory to which to save. Will be created if it doesn't exist.
-            saved_model (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            saved_model (`bool`, *optional*, defaults to `False`):
                 If the model has to be saved in saved model format as well or not.
-            version (:obj:`int`, `optional`, defaults to 1):
+            version (`int`, *optional*, defaults to 1):
                 The version of the saved model. A saved model needs to be versioned in order to be properly loaded by
                 TensorFlow Serving as detailed in the official documentation
                 https://www.tensorflow.org/tfx/serving/serving_basic
-            push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            push_to_hub (`bool`, *optional*, defaults to `False`):
                 Whether or not to push your model to the Hugging Face model hub after saving it.
 
-                .. warning::
+                <Tip warning={true}>
 
-                    Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with
-                    :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are
-                    pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory
-                    instead.
+                Using `push_to_hub=True` will synchronize the repository you are pushing to with
+                `save_directory`, which requires `save_directory` to be a local clone of the repo you are
+                pushing to if it's an existing folder. Pass along `temp_dir=True` to use a temporary directory
+                instead.
+
+                </Tip>
 
             kwargs:
                 Additional key word arguments passed along to the
-                :meth:`~transformers.file_utils.PushToHubMixin.push_to_hub` method.
+                [`~file_utils.PushToHubMixin.push_to_hub`] method.
         """
         if os.path.isfile(save_directory):
             logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
@@ -1357,113 +1366,113 @@ class TFPreTrainedModel(tf.keras.Model, TFModelUtilsMixin, TFGenerationMixin, Pu
         r"""
         Instantiate a pretrained TF 2.0 model from a pre-trained model configuration.
 
-        The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come
+        The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
         pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
         task.
 
-        The warning `Weights from XXX not used in YYY` means that the layer XXX is not used by YYY, therefore those
+        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
         weights are discarded.
 
         Parameters:
-            pretrained_model_name_or_path (:obj:`str`, `optional`):
+            pretrained_model_name_or_path (`str`, *optional*):
                 Can be either:
 
-                    - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
-                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                    - A path to a `directory` containing model weights saved using
-                      :func:`~transformers.TFPreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
-                    - A path or url to a `PyTorch state_dict save file` (e.g, ``./pt_model/pytorch_model.bin``). In
-                      this case, ``from_pt`` should be set to :obj:`True` and a configuration object should be provided
-                      as ``config`` argument. This loading path is slower than converting the PyTorch model in a
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+                      a user or organization name, like `dbmdz/bert-base-german-cased`.
+                    - A path to a *directory* containing model weights saved using
+                      [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+                    - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In
+                      this case, `from_pt` should be set to `True` and a configuration object should be provided
+                      as `config` argument. This loading path is slower than converting the PyTorch model in a
                       TensorFlow model using the provided conversion scripts and loading the TensorFlow model
                       afterwards.
-                    - :obj:`None` if you are both providing the configuration and state dictionary (resp. with keyword
-                      arguments ``config`` and ``state_dict``).
-            model_args (sequence of positional arguments, `optional`):
-                All remaining positional arguments will be passed to the underlying model's ``__init__`` method.
-            config (:obj:`Union[PretrainedConfig, str]`, `optional`):
+                    - `None` if you are both providing the configuration and state dictionary (resp. with keyword
+                      arguments `config` and `state_dict`).
+            model_args (sequence of positional arguments, *optional*):
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
+            config (`Union[PretrainedConfig, str]`, *optional*):
                 Can be either:
 
-                    - an instance of a class derived from :class:`~transformers.PretrainedConfig`,
-                    - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained`.
+                    - an instance of a class derived from [`PretrainedConfig`],
+                    - a string valid as input to [`~PretrainedConfig.from_pretrained`].
 
                 Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                 be automatically loaded when:
 
-                    - The model is a model provided by the library (loaded with the `model id` string of a pretrained
+                    - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                       model).
-                    - The model was saved using :func:`~transformers.TFPreTrainedModel.save_pretrained` and is reloaded
+                    - The model was saved using [`~TFPreTrainedModel.save_pretrained`] and is reloaded
                       by supplying the save directory.
-                    - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
-                      configuration JSON file named `config.json` is found in the directory.
-            from_pt: (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                    - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
+                      configuration JSON file named *config.json* is found in the directory.
+            from_pt: (`bool`, *optional*, defaults to `False`):
                 Load the model weights from a PyTorch state_dict save file (see docstring of
-                ``pretrained_model_name_or_path`` argument).
-            ignore_mismatched_sizes (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                `pretrained_model_name_or_path` argument).
+            ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
                 Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
                 as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
                 checkpoint with 3 labels).
-            cache_dir (:obj:`str`, `optional`):
+            cache_dir (`str`, *optional*):
                 Path to a directory in which a downloaded pretrained model configuration should be cached if the
                 standard cache should not be used.
-            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                 cached versions if they exist.
-            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            resume_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                 file exists.
-            proxies: (:obj:`Dict[str, str], `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
+            proxies: (`Dict[str, str]`, *optional*): A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
                 Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
-            local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
+            local_files_only(`bool`, *optional*, defaults to `False`):
                 Whether or not to only look at local files (e.g., not try doanloading the model).
-            use_auth_token (:obj:`str` or `bool`, `optional`):
-                The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
-                generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
-            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            use_auth_token (`str` or `bool`, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+                generated when running `transformers-cli login` (stored in `~/.huggingface`).
+            revision(`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                 identifier allowed by git.
-            mirror(:obj:`str`, `optional`):
+            mirror(`str`, *optional*):
                 Mirror source to accelerate downloads in China. If you are from China and have an accessibility
                 problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
                 Please refer to the mirror site for more information.
-            kwargs (remaining dictionary of keyword arguments, `optional`):
+            kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
-                :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
+                `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
                 automatically loaded:
 
-                    - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
-                      underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
+                    - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
+                      underlying model's `__init__` method (we assume all relevant updates to the configuration have
                       already been done)
-                    - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
-                      initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
-                      ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
-                      with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
-                      attribute will be passed to the underlying model's ``__init__`` function.
+                    - If a configuration is not provided, `kwargs` will be first passed to the configuration class
+                      initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of
+                      `kwargs` that corresponds to a configuration attribute will be used to override said attribute
+                      with the supplied `kwargs` value. Remaining keys that do not correspond to any configuration
+                      attribute will be passed to the underlying model's `__init__` function.
 
-        .. note::
+        <Tip>
 
-            Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+        Passing `use_auth_token=True` is required when you want to use a private model.
 
-        Examples::
+        </Tip>
 
-            >>> from transformers import BertConfig, TFBertModel
-            >>> # Download model and configuration from huggingface.co and cache.
-            >>> model = TFBertModel.from_pretrained('bert-base-uncased')
-            >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
-            >>> model = TFBertModel.from_pretrained('./test/saved_model/')
-            >>> # Update configuration during loading.
-            >>> model = TFBertModel.from_pretrained('bert-base-uncased', output_attentions=True)
-            >>> assert model.config.output_attentions == True
-            >>> # Loading from a Pytorch model file instead of a TensorFlow checkpoint (slower, for example purposes, not runnable).
-            >>> config = BertConfig.from_json_file('./pt_model/my_pt_model_config.json')
-            >>> model = TFBertModel.from_pretrained('./pt_model/my_pytorch_model.bin', from_pt=True, config=config)
+        Examples:
 
-        """
+        ```python
+        >>> from transformers import BertConfig, TFBertModel
+        >>> # Download model and configuration from huggingface.co and cache.
+        >>> model = TFBertModel.from_pretrained('bert-base-uncased')
+        >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
+        >>> model = TFBertModel.from_pretrained('./test/saved_model/')
+        >>> # Update configuration during loading.
+        >>> model = TFBertModel.from_pretrained('bert-base-uncased', output_attentions=True)
+        >>> assert model.config.output_attentions == True
+        >>> # Loading from a Pytorch model file instead of a TensorFlow checkpoint (slower, for example purposes, not runnable).
+        >>> config = BertConfig.from_json_file('./pt_model/my_pt_model_config.json')
+        >>> model = TFBertModel.from_pretrained('./pt_model/my_pytorch_model.bin', from_pt=True, config=config)
+        ```"""
         config = kwargs.pop("config", None)
         cache_dir = kwargs.pop("cache_dir", None)
         from_pt = kwargs.pop("from_pt", False)
@@ -1685,14 +1694,14 @@ class TFConv1D(tf.keras.layers.Layer):
     Basically works like a linear layer but the weights are transposed.
 
     Args:
-        nf (:obj:`int`):
+        nf (`int`):
             The number of output features.
-        nx (:obj:`int`):
+        nx (`int`):
             The number of input features.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation to use to initialize the weights.
         kwargs:
-            Additional keyword arguments passed along to the :obj:`__init__` of :obj:`tf.keras.layers.Layer`.
+            Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
     """
 
     def __init__(self, nf, nx, initializer_range=0.02, **kwargs):
@@ -1726,15 +1735,15 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
     modeling.
 
     Args:
-        vocab_size (:obj:`int`):
+        vocab_size (`int`):
             The size of the vocabulary, e.g., the number of unique tokens.
-        hidden_size (:obj:`int`):
+        hidden_size (`int`):
             The size of the embedding vectors.
-        initializer_range (:obj:`float`, `optional`):
+        initializer_range (`float`, *optional*):
             The standard deviation to use when initializing the weights. If no value is provided, it will default to
-            :math:`1/\sqrt{hidden\_size}`.
+            \\(1/\sqrt{hidden\_size}\\).
         kwargs:
-            Additional keyword arguments passed along to the :obj:`__init__` of :obj:`tf.keras.layers.Layer`.
+            Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
     """
 
     def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optional[float] = None, **kwargs):
@@ -1768,25 +1777,24 @@ class TFSharedEmbeddings(tf.keras.layers.Layer):
         Get token embeddings of inputs or decode final hidden state.
 
         Args:
-            inputs (:obj:`tf.Tensor`):
-                In embedding mode, should be an int64 tensor with shape :obj:`[batch_size, length]`.
+            inputs (`tf.Tensor`):
+                In embedding mode, should be an int64 tensor with shape `[batch_size, length]`.
 
-                In linear mode, should be a float tensor with shape :obj:`[batch_size, length, hidden_size]`.
-            mode (:obj:`str`, defaults to :obj:`"embedding"`):
-               A valid value is either :obj:`"embedding"` or :obj:`"linear"`, the first one indicates that the layer
+                In linear mode, should be a float tensor with shape `[batch_size, length, hidden_size]`.
+            mode (`str`, defaults to `"embedding"`):
+               A valid value is either `"embedding"` or `"linear"`, the first one indicates that the layer
                should be used as an embedding layer, the second one that the layer should be used as a linear decoder.
 
         Returns:
-            :obj:`tf.Tensor`: In embedding mode, the output is a float32 embedding tensor, with shape
-            :obj:`[batch_size, length, embedding_size]`.
+            `tf.Tensor`: In embedding mode, the output is a float32 embedding tensor, with shape
+            `[batch_size, length, embedding_size]`.
 
-            In linear mode, the output is a float32 with shape :obj:`[batch_size, length, vocab_size]`.
+            In linear mode, the output is a float32 with shape `[batch_size, length, vocab_size]`.
 
         Raises:
-            ValueError: if :obj:`mode` is not valid.
+            ValueError: if `mode` is not valid.
 
-        Shared weights logic is adapted from `here
-        <https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24>`__.
+        Shared weights logic is adapted from [here](https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24).
         """
         if mode == "embedding":
             return self._embedding(inputs)
@@ -1821,31 +1829,31 @@ class TFSequenceSummary(tf.keras.layers.Layer):
     Compute a single vector summary of a sequence hidden states.
 
     Args:
-        config (:class:`~transformers.PretrainedConfig`):
+        config ([`PretrainedConfig`]):
             The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
             config class of your model for the default values it uses):
 
-            - **summary_type** (:obj:`str`) -- The method to use to make this summary. Accepted values are:
+            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
 
-                - :obj:`"last"` -- Take the last token hidden state (like XLNet)
-                - :obj:`"first"` -- Take the first token hidden state (like Bert)
-                - :obj:`"mean"` -- Take the mean of all tokens hidden states
-                - :obj:`"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
-                - :obj:`"attn"` -- Not implemented now, use multi-head attention
+                - `"last"` -- Take the last token hidden state (like XLNet)
+                - `"first"` -- Take the first token hidden state (like Bert)
+                - `"mean"` -- Take the mean of all tokens hidden states
+                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
+                - `"attn"` -- Not implemented now, use multi-head attention
 
-            - **summary_use_proj** (:obj:`bool`) -- Add a projection after the vector extraction.
-            - **summary_proj_to_labels** (:obj:`bool`) -- If :obj:`True`, the projection outputs to
-              :obj:`config.num_labels` classes (otherwise to :obj:`config.hidden_size`).
-            - **summary_activation** (:obj:`Optional[str]`) -- Set to :obj:`"tanh"` to add a tanh activation to the
-              output, another string or :obj:`None` will add no activation.
-            - **summary_first_dropout** (:obj:`float`) -- Optional dropout probability before the projection and
+            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to
+              `config.num_labels` classes (otherwise to `config.hidden_size`).
+            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the
+              output, another string or `None` will add no activation.
+            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and
               activation.
-            - **summary_last_dropout** (:obj:`float`)-- Optional dropout probability after the projection and
+            - **summary_last_dropout** (`float`) -- Optional dropout probability after the projection and
               activation.
 
-        initializer_range (:obj:`float`, defaults to 0.02): The standard deviation to use to initialize the weights.
+        initializer_range (`float`, defaults to 0.02): The standard deviation to use to initialize the weights.
         kwargs:
-            Additional keyword arguments passed along to the :obj:`__init__` of :obj:`tf.keras.layers.Layer`.
+            Additional keyword arguments passed along to the `__init__` of `tf.keras.layers.Layer`.
     """
 
     def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, **kwargs):
@@ -1937,10 +1945,10 @@ def shape_list(tensor: tf.Tensor) -> List[int]:
     Deal with dynamic shape in tensorflow cleanly.
 
     Args:
-        tensor (:obj:`tf.Tensor`): The tensor we want the shape of.
+        tensor (`tf.Tensor`): The tensor we want the shape of.
 
     Returns:
-        :obj:`List[int]`: The shape of the tensor as a list.
+        `List[int]`: The shape of the tensor as a list.
     """
     dynamic = tf.shape(tensor)
 
@@ -1954,13 +1962,13 @@ def shape_list(tensor: tf.Tensor) -> List[int]:
 
 def get_initializer(initializer_range: float = 0.02) -> tf.initializers.TruncatedNormal:
     """
-    Creates a :obj:`tf.initializers.TruncatedNormal` with the given range.
+    Creates a `tf.initializers.TruncatedNormal` with the given range.
 
     Args:
-        initializer_range (`float`, defaults to 0.02): Standard deviation of the initializer range.
+        initializer_range (`float`, defaults to 0.02): Standard deviation of the initializer range.
 
     Returns:
-        :obj:`tf.initializers.TruncatedNormal`: The truncated normal initializer.
+        `tf.initializers.TruncatedNormal`: The truncated normal initializer.
     """
     return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)
 
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 2cc37a6f94..e4d58ff501 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -89,16 +89,16 @@ def find_pruneable_heads_and_indices(
     heads: List[int], n_heads: int, head_size: int, already_pruned_heads: Set[int]
 ) -> Tuple[Set[int], torch.LongTensor]:
     """
-    Finds the heads and their indices taking :obj:`already_pruned_heads` into account.
+    Finds the heads and their indices taking `already_pruned_heads` into account.
 
     Args:
-        heads (:obj:`List[int]`): List of the indices of heads to prune.
-        n_heads (:obj:`int`): The number of heads in the model.
-        head_size (:obj:`int`): The size of each head.
-        already_pruned_heads (:obj:`Set[int]`): A set of already pruned heads.
+        heads (`List[int]`): List of the indices of heads to prune.
+        n_heads (`int`): The number of heads in the model.
+        head_size (`int`): The size of each head.
+        already_pruned_heads (`Set[int]`): A set of already pruned heads.
 
     Returns:
-        :obj:`Tuple[Set[int], torch.LongTensor]`: A tuple with the remaining heads and their corresponding indices.
+        `Tuple[Set[int], torch.LongTensor]`: A tuple with the remaining heads and their corresponding indices.
     """
     mask = torch.ones(n_heads, head_size)
     heads = set(heads) - already_pruned_heads  # Convert to set and remove already pruned heads
@@ -143,7 +143,7 @@ def get_parameter_dtype(parameter: Union[nn.Module, GenerationMixin, "ModuleUtil
 
 class ModuleUtilsMixin:
     """
-    A few utilities for :obj:`torch.nn.Modules`, to be used as a mixin.
+    A few utilities for `torch.nn.Modules`, to be used as a mixin.
     """
 
     @staticmethod
@@ -176,8 +176,8 @@ class ModuleUtilsMixin:
         """
         Add a memory hook before and after each sub-module forward pass to record increase in memory consumption.
 
-        Increase in memory consumption is stored in a :obj:`mem_rss_diff` attribute for each module and can be reset to
-        zero with :obj:`model.reset_memory_hooks_state()`.
+        Increase in memory consumption is stored in a `mem_rss_diff` attribute for each module and can be reset to
+        zero with `model.reset_memory_hooks_state()`.
         """
         for module in self.modules():
             module.register_forward_pre_hook(self._hook_rss_memory_pre_forward)
@@ -186,8 +186,8 @@ class ModuleUtilsMixin:
 
     def reset_memory_hooks_state(self):
         """
-        Reset the :obj:`mem_rss_diff` attribute of each module (see
-        :func:`~transformers.modeling_utils.ModuleUtilsMixin.add_memory_hooks`).
+        Reset the `mem_rss_diff` attribute of each module (see
+        [`~modeling_utils.ModuleUtilsMixin.add_memory_hooks`]).
         """
         for module in self.modules():
             module.mem_rss_diff = 0
@@ -197,7 +197,7 @@ class ModuleUtilsMixin:
     @property
     def device(self) -> device:
         """
-        :obj:`torch.device`: The device on which the module is (assuming that all the module parameters are on the same
+        `torch.device`: The device on which the module is (assuming that all the module parameters are on the same
         device).
         """
         return get_parameter_device(self)
@@ -205,7 +205,7 @@ class ModuleUtilsMixin:
     @property
     def dtype(self) -> torch.dtype:
         """
-        :obj:`torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
+        `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
         """
         return get_parameter_dtype(self)
 
@@ -214,10 +214,10 @@ class ModuleUtilsMixin:
         Invert an attention mask (e.g., switches 0. and 1.).
 
         Args:
-            encoder_attention_mask (:obj:`torch.Tensor`): An attention mask.
+            encoder_attention_mask (`torch.Tensor`): An attention mask.
 
         Returns:
-            :obj:`torch.Tensor`: The inverted attention mask.
+            `torch.Tensor`: The inverted attention mask.
         """
         if encoder_attention_mask.dim() == 3:
             encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
@@ -246,15 +246,15 @@ class ModuleUtilsMixin:
         Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
 
         Arguments:
-            attention_mask (:obj:`torch.Tensor`):
+            attention_mask (`torch.Tensor`):
                 Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
-            input_shape (:obj:`Tuple[int]`):
+            input_shape (`Tuple[int]`):
                 The shape of the input to the model.
-            device: (:obj:`torch.device`):
+            device (`torch.device`):
                 The device of the input to the model.
 
         Returns:
-            :obj:`torch.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`.
+            `torch.Tensor`: The extended attention mask, with the same dtype as `attention_mask.dtype`.
         """
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
@@ -308,16 +308,16 @@ class ModuleUtilsMixin:
         Prepare the head mask if needed.
 
         Args:
-            head_mask (:obj:`torch.Tensor` with shape :obj:`[num_heads]` or :obj:`[num_hidden_layers x num_heads]`, `optional`):
+            head_mask (`torch.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*):
                 The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
-            num_hidden_layers (:obj:`int`):
+            num_hidden_layers (`int`):
                 The number of hidden layers in the model.
-            is_attention_chunked: (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            is_attention_chunked (`bool`, *optional*, defaults to `False`):
                 Whether or not the attentions scores are computed by chunks or not.
 
         Returns:
-            :obj:`torch.Tensor` with shape :obj:`[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or
-            list with :obj:`[None]` for each layer.
+            `torch.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or
+            list with `[None]` for each layer.
         """
         if head_mask is not None:
             head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers)
@@ -344,14 +344,14 @@ class ModuleUtilsMixin:
         Get number of (optionally, trainable or non-embeddings) parameters in the module.
 
         Args:
-            only_trainable (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            only_trainable (`bool`, *optional*, defaults to `False`):
                 Whether or not to return only the number of trainable parameters
 
-            exclude_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            exclude_embeddings (`bool`, *optional*, defaults to `False`):
                 Whether or not to return only the number of non-embeddings parameters
 
         Returns:
-            :obj:`int`: The number of parameters.
+            `int`: The number of parameters.
         """
 
         if exclude_embeddings:
@@ -370,10 +370,10 @@ class ModuleUtilsMixin:
         Helper function to estimate the total number of tokens from the model inputs.
 
         Args:
-            inputs (:obj:`dict`): The model inputs.
+            input_dict (`dict`): The model inputs.
 
         Returns:
-            :obj:`int`: The total number of tokens.
+            `int`: The total number of tokens.
         """
         if self.main_input_name in input_dict:
             return input_dict[self.main_input_name].numel()
@@ -389,22 +389,21 @@ class ModuleUtilsMixin:
         """
         Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a
         batch with this transformer model. Default approximation neglects the quadratic dependency on the number of
-        tokens (valid if :obj:`12 * d_model << sequence_length`) as laid out in `this paper
-        <https://arxiv.org/pdf/2001.08361.pdf>`__ section 2.1. Should be overridden for transformers with parameter
+        tokens (valid if `12 * d_model << sequence_length`) as laid out in [this paper](https://arxiv.org/pdf/2001.08361.pdf) section 2.1. Should be overridden for transformers with parameter
         re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths.
 
         Args:
-            batch_size (:obj:`int`):
+            batch_size (`int`):
                 The batch size for the forward pass.
 
-            sequence_length (:obj:`int`):
+            sequence_length (`int`):
                 The number of tokens in each line of the batch.
 
-            exclude_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            exclude_embeddings (`bool`, *optional*, defaults to `True`):
                 Whether or not to count embedding and softmax operations.
 
         Returns:
-            :obj:`int`: The number of floating-point operations.
+            `int`: The number of floating-point operations.
         """
 
         return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings)
@@ -414,30 +413,30 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
     r"""
     Base class for all models.
 
-    :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods
+    [`PreTrainedModel`] takes care of storing the configuration of the models and handles methods
     for loading, downloading and saving models as well as a few methods common to all models to:
 
-        * resize the input embeddings,
-        * prune heads in the self-attention heads.
+        - resize the input embeddings,
+        - prune heads in the self-attention heads.
 
     Class attributes (overridden by derived classes):
 
-        - **config_class** (:class:`~transformers.PretrainedConfig`) -- A subclass of
-          :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
-        - **load_tf_weights** (:obj:`Callable`) -- A python `method` for loading a TensorFlow checkpoint in a PyTorch
+        - **config_class** ([`PretrainedConfig`]) -- A subclass of
+          [`PretrainedConfig`] to use as configuration class for this model architecture.
+        - **load_tf_weights** (`Callable`) -- A python *method* for loading a TensorFlow checkpoint in a PyTorch
           model, taking as arguments:
 
-            - **model** (:class:`~transformers.PreTrainedModel`) -- An instance of the model on which to load the
+            - **model** ([`PreTrainedModel`]) -- An instance of the model on which to load the
               TensorFlow checkpoint.
-            - **config** (:class:`~transformers.PreTrainedConfig`) -- An instance of the configuration associated to
+            - **config** ([`PretrainedConfig`]) -- An instance of the configuration associated to
               the model.
-            - **path** (:obj:`str`) -- A path to the TensorFlow checkpoint.
+            - **path** (`str`) -- A path to the TensorFlow checkpoint.
 
-        - **base_model_prefix** (:obj:`str`) -- A string indicating the attribute associated to the base model in
+        - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in
           derived classes of the same architecture adding modules on top of the base model.
-        - **is_parallelizable** (:obj:`bool`) -- A flag indicating whether this model supports model parallelization.
-        - **main_input_name** (:obj:`str`) -- The name of the principal input to the model (often :obj:`input_ids` for
-          NLP models, :obj:`pixel_values` for vision models and :obj:`input_values` for speech models).
+        - **is_parallelizable** (`bool`) -- A flag indicating whether this model supports model parallelization.
+        - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for
+          NLP models, `pixel_values` for vision models and `input_values` for speech models).
     """
     config_class = None
     base_model_prefix = ""
@@ -459,7 +458,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
     @property
     def dummy_inputs(self) -> Dict[str, torch.Tensor]:
         """
-        :obj:`Dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network.
+        `Dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network.
         """
         return {"input_ids": torch.tensor(DUMMY_INPUTS)}
 
@@ -502,8 +501,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         All context managers that the model should be initialized under go here.
 
         Args:
-            torch_dtype (:obj:`torch.dtype`, `optional`):
-                Override the default ``torch.dtype`` and load the model under this dtype.
+            torch_dtype (`torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model under this dtype.
         """
         torch_dtype = kwargs.pop("torch_dtype", None)
 
@@ -536,15 +535,15 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         under specific dtype.
 
         Args:
-            dtype (:obj:`torch.dtype`):
+            dtype (`torch.dtype`):
                 a floating dtype to set to.
 
         Returns:
-            :obj:`torch.dtype`: the original ``dtype`` that can be used to restore ``torch.set_default_dtype(dtype)``
-            if it was modified. If it wasn't, returns :obj:`None`.
+            `torch.dtype`: the original `dtype` that can be used to restore `torch.set_default_dtype(dtype)`
+            if it was modified. If it wasn't, returns `None`.
 
-        Note ``set_default_dtype`` currently only works with floating-point types and asserts if for example,
-        ``torch.int64`` is passed. So if a non-float ``dtype`` is passed this functions will throw an exception.
+        Note `set_default_dtype` currently only works with floating-point types and asserts if for example,
+        `torch.int64` is passed. So if a non-float `dtype` is passed this function will throw an exception.
         """
         if not dtype.is_floating_point:
             raise ValueError(
@@ -559,7 +558,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
     @property
     def base_model(self) -> nn.Module:
         """
-        :obj:`torch.nn.Module`: The main body of the model.
+        `torch.nn.Module`: The main body of the model.
         """
         return getattr(self, self.base_model_prefix, self)
 
@@ -568,7 +567,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         Returns the model's input embeddings.
 
         Returns:
-            :obj:`nn.Module`: A torch module mapping vocabulary to hidden states.
+            `nn.Module`: A torch module mapping vocabulary to hidden states.
         """
         base_model = getattr(self, self.base_model_prefix, self)
         if base_model is not self:
@@ -581,7 +580,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         Set model's input embeddings.
 
         Args:
-            value (:obj:`nn.Module`): A module mapping vocabulary to hidden states.
+            value (`nn.Module`): A module mapping vocabulary to hidden states.
         """
         base_model = getattr(self, self.base_model_prefix, self)
         if base_model is not self:
@@ -594,7 +593,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         Returns the model's output embeddings.
 
         Returns:
-            :obj:`nn.Module`: A torch module mapping hidden states to vocabulary.
+            `nn.Module`: A torch module mapping hidden states to vocabulary.
         """
         return None  # Overwrite for models with output embeddings
 
@@ -608,7 +607,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         """
         Tie the weights between the input embeddings and the output embeddings.
 
-        If the :obj:`torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning
+        If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning
         the weights instead.
         """
         output_embeddings = self.get_output_embeddings()
@@ -719,19 +718,19 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
 
     def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding:
         """
-        Resizes input token embeddings matrix of the model if :obj:`new_num_tokens != config.vocab_size`.
+        Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`.
 
-        Takes care of tying weights embeddings afterwards if the model class has a :obj:`tie_weights()` method.
+        Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
 
         Arguments:
-            new_num_tokens (:obj:`int`, `optional`):
+            new_num_tokens (`int`, *optional*):
                 The number of new tokens in the embedding matrix. Increasing the size will add newly initialized
-                vectors at the end. Reducing the size will remove vectors from the end. If not provided or :obj:`None`,
-                just returns a pointer to the input tokens :obj:`torch.nn.Embedding` module of the model without doing
+                vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`,
+                just returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing
                 anything.
 
         Return:
-            :obj:`torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
+            `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model.
         """
         model_embeds = self._resize_token_embeddings(new_num_tokens)
         if new_num_tokens is None:
@@ -767,18 +766,18 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         initialized vectors at the end. Reducing the size will remove vectors from the end
 
         Args:
-            old_embeddings (:obj:`torch.nn.Embedding`):
+            old_embeddings (`torch.nn.Embedding`):
                 Old embeddings to be resized.
-            new_num_tokens (:obj:`int`, `optional`):
+            new_num_tokens (`int`, *optional*):
                 New number of tokens in the embedding matrix.
 
                 Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
-                vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens
-                :obj:`torch.nn.Embedding`` module of the model without doing anything.
+                vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
+                `torch.nn.Embedding` module of the model without doing anything.
 
         Return:
-            :obj:`torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if
-            :obj:`new_num_tokens` is :obj:`None`
+            `torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if
+            `new_num_tokens` is `None`
         """
         if new_num_tokens is None:
             return old_embeddings
@@ -830,21 +829,19 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         vectors at the end. Reducing the size will remove vectors from the end
 
         Args:
-            old_lm_head (:obj:`torch.nn.Linear`):
+            old_lm_head (`torch.nn.Linear`):
                 Old lm head liner layer to be resized.
-            new_num_tokens (:obj:`int`, `optional`):
+            new_num_tokens (`int`, *optional*):
                 New number of tokens in the linear matrix.
 
                 Increasing the size will add newly initialized vectors at the end. Reducing the size will remove
-                vectors from the end. If not provided or :obj:`None`, just returns a pointer to the input tokens
-                :obj:`torch.nn.Linear`` module of the model without doing anything.
-            transposed (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether ``old_lm_head`` is transposed or not. If True ``old_lm_head.size()`` is ``lm_head_dim,
-                vocab_size`` else ``vocab_size, lm_head_dim``.
+                vectors from the end. If not provided or `None`, just returns a pointer to the input tokens
+                `torch.nn.Linear` module of the model without doing anything.
+            transposed (`bool`, *optional*, defaults to `False`): Whether `old_lm_head` is transposed or not. If True `old_lm_head.size()` is `lm_head_dim, vocab_size` else `vocab_size, lm_head_dim`.
 
         Return:
-            :obj:`torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if
-            :obj:`new_num_tokens` is :obj:`None`
+            `torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if
+            `new_num_tokens` is `None`
         """
         if new_num_tokens is None:
             return old_lm_head
@@ -946,9 +943,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         Prunes heads of the base model.
 
         Arguments:
-            heads_to_prune (:obj:`Dict[int, List[int]]`):
-                Dictionary with keys being selected layer indices (:obj:`int`) and associated values being the list of
-                heads to prune in said layer (list of :obj:`int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads
+            heads_to_prune (`Dict[int, List[int]]`):
+                Dictionary with keys being selected layer indices (`int`) and associated values being the list of
+                heads to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads
                 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
         """
         # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
@@ -1000,35 +997,37 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
     ):
         """
         Save a model and its configuration file to a directory, so that it can be re-loaded using the
-        `:func:`~transformers.PreTrainedModel.from_pretrained`` class method.
+        [`~PreTrainedModel.from_pretrained`] class method.
 
         Arguments:
-            save_directory (:obj:`str` or :obj:`os.PathLike`):
+            save_directory (`str` or `os.PathLike`):
                 Directory to which to save. Will be created if it doesn't exist.
-            save_config (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            save_config (`bool`, *optional*, defaults to `True`):
                 Whether or not to save the config of the model. Useful when in distributed training like TPUs and need
-                to call this function on all processes. In this case, set :obj:`save_config=True` only on the main
+                to call this function on all processes. In this case, set `save_config=True` only on the main
                 process to avoid race conditions.
-            state_dict (nested dictionary of :obj:`torch.Tensor`):
-                The state dictionary of the model to save. Will default to :obj:`self.state_dict()`, but can be used to
+            state_dict (nested dictionary of `torch.Tensor`):
+                The state dictionary of the model to save. Will default to `self.state_dict()`, but can be used to
                 only save parts of the model or if special precautions need to be taken when recovering the state
                 dictionary of a model (like when using model parallelism).
-            save_function (:obj:`Callable`):
+            save_function (`Callable`):
                 The function to use to save the state dictionary. Useful on distributed training like TPUs when one
-                need to replace :obj:`torch.save` by another method.
-            push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                need to replace `torch.save` by another method.
+            push_to_hub (`bool`, *optional*, defaults to `False`):
                 Whether or not to push your model to the Hugging Face model hub after saving it.
 
-                .. warning::
+                <Tip warning={true}>
 
-                    Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with
-                    :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are
-                    pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory
-                    instead.
+                Using `push_to_hub=True` will synchronize the repository you are pushing to with
+                `save_directory`, which requires `save_directory` to be a local clone of the repo you are
+                pushing to if it's an existing folder. Pass along `temp_dir=True` to use a temporary directory
+                instead.
+
+                </Tip>
 
             kwargs:
                 Additional key word arguments passed along to the
-                :meth:`~transformers.file_utils.PushToHubMixin.push_to_hub` method.
+                [`~file_utils.PushToHubMixin.push_to_hub`] method.
         """
         if os.path.isfile(save_directory):
             logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
@@ -1080,152 +1079,155 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         r"""
         Instantiate a pretrained pytorch model from a pre-trained model configuration.
 
-        The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated). To
-        train the model, you should first set it back in training mode with ``model.train()``.
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To
+        train the model, you should first set it back in training mode with `model.train()`.
 
-        The warning `Weights from XXX not initialized from pretrained model` means that the weights of XXX do not come
+        The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come
         pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning
         task.
 
-        The warning `Weights from XXX not used in YYY` means that the layer XXX is not used by YYY, therefore those
+        The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those
         weights are discarded.
 
         Parameters:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`, `optional`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*):
                 Can be either:
 
-                    - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
-                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                    - A path to a `directory` containing model weights saved using
-                      :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
-                    - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In
-                      this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided
-                      as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+                      a user or organization name, like `dbmdz/bert-base-german-cased`.
+                    - A path to a *directory* containing model weights saved using
+                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+                    - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In
+                      this case, `from_tf` should be set to `True` and a configuration object should be provided
+                      as `config` argument. This loading path is slower than converting the TensorFlow checkpoint in
                       a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-                    - A path or url to a model folder containing a `flax checkpoint file` in `.msgpack` format (e.g,
-                      ``./flax_model/`` containing ``flax_model.msgpack``). In this case, ``from_flax`` should be set
-                      to :obj:`True`.
-                    - :obj:`None` if you are both providing the configuration and state dictionary (resp. with keyword
-                      arguments ``config`` and ``state_dict``).
-            model_args (sequence of positional arguments, `optional`):
-                All remaining positional arguments will be passed to the underlying model's ``__init__`` method.
-            config (:obj:`Union[PretrainedConfig, str, os.PathLike]`, `optional`):
+                    - A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g,
+                      `./flax_model/` containing `flax_model.msgpack`). In this case, `from_flax` should be set
+                      to `True`.
+                    - `None` if you are both providing the configuration and state dictionary (resp. with keyword
+                      arguments `config` and `state_dict`).
+            model_args (sequence of positional arguments, *optional*):
+                All remaining positional arguments will be passed to the underlying model's `__init__` method.
+            config (`Union[PretrainedConfig, str, os.PathLike]`, *optional*):
                 Can be either:
 
-                    - an instance of a class derived from :class:`~transformers.PretrainedConfig`,
-                    - a string or path valid as input to :func:`~transformers.PretrainedConfig.from_pretrained`.
+                    - an instance of a class derived from [`PretrainedConfig`],
+                    - a string or path valid as input to [`~PretrainedConfig.from_pretrained`].
 
                 Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                 be automatically loaded when:
 
-                    - The model is a model provided by the library (loaded with the `model id` string of a pretrained
+                    - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                       model).
-                    - The model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
+                    - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded
                       by supplying the save directory.
-                    - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
-                      configuration JSON file named `config.json` is found in the directory.
-            state_dict (:obj:`Dict[str, torch.Tensor]`, `optional`):
+                    - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
+                      configuration JSON file named *config.json* is found in the directory.
+            state_dict (`Dict[str, torch.Tensor]`, *optional*):
                 A state dictionary to use instead of a state dictionary loaded from saved weights file.
 
                 This option can be used if you want to create a model from a pretrained configuration but load your own
                 weights. In this case though, you should check if using
-                :func:`~transformers.PreTrainedModel.save_pretrained` and
-                :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-            cache_dir (:obj:`Union[str, os.PathLike]`, `optional`):
+                [`~PreTrainedModel.save_pretrained`] and
+                [`~PreTrainedModel.from_pretrained`] is not a simpler option.
+            cache_dir (`Union[str, os.PathLike]`, *optional*):
                 Path to a directory in which a downloaded pretrained model configuration should be cached if the
                 standard cache should not be used.
-            from_tf (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            from_tf (`bool`, *optional*, defaults to `False`):
                 Load the model weights from a TensorFlow checkpoint save file (see docstring of
-                ``pretrained_model_name_or_path`` argument).
-            from_flax (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                `pretrained_model_name_or_path` argument).
+            from_flax (`bool`, *optional*, defaults to `False`):
                 Load the model weights from a Flax checkpoint save file (see docstring of
-                ``pretrained_model_name_or_path`` argument).
-            ignore_mismatched_sizes (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                `pretrained_model_name_or_path` argument).
+            ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`):
                 Whether or not to raise an error if some of the weights from the checkpoint do not have the same size
                 as the weights of the model (if for instance, you are instantiating a model with 10 labels from a
                 checkpoint with 3 labels).
-            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                 cached versions if they exist.
-            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            resume_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                 file exists.
-            proxies (:obj:`Dict[str, str]`, `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info (`bool`, *optional*, defaults to `False`):
                 Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
-            local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
+            local_files_only(`bool`, *optional*, defaults to `False`):
                 Whether or not to only look at local files (i.e., do not try to download the model).
-            use_auth_token (:obj:`str` or `bool`, `optional`):
-                The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
-                generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
-            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            use_auth_token (`str` or `bool`, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+                generated when running `transformers-cli login` (stored in `~/.huggingface`).
+            revision(`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                 identifier allowed by git.
-            mirror(:obj:`str`, `optional`):
+            mirror(`str`, *optional*):
                 Mirror source to accelerate downloads in China. If you are from China and have an accessibility
                 problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
                 Please refer to the mirror site for more information.
-            _fast_init(:obj:`bool`, `optional`, defaults to `:obj:`True`):
+            _fast_init(`bool`, *optional*, defaults to `True`):
                 Whether or not to disable fast initialization.
-            low_cpu_mem_usage(:obj:`bool`, `optional`, defaults to `:obj:`False`):
+            low_cpu_mem_usage(`bool`, *optional*, defaults to `False`):
                 Tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
                 This is an experimental feature and a subject to change at any moment.
-            torch_dtype (:obj:`str` or :obj:`torch.dtype`, `optional`):
-                Override the default ``torch.dtype`` and load the model under this dtype. If ``"auto"`` is passed the
+            torch_dtype (`str` or `torch.dtype`, *optional*):
+                Override the default `torch.dtype` and load the model under this dtype. If `"auto"` is passed the
                 dtype will be automatically derived from the model's weights.
 
-                .. warning::
+                <Tip warning={true}>
 
-                    One should only disable `_fast_init` to ensure backwards compatibility with
-                    ``transformers.__version__ < 4.6.0`` for seeded model initialization. This argument will be removed
-                    at the next major version. See `pull request 11471
-                    <https://github.com/huggingface/transformers/pull/11471>`__ for more information.
+                One should only disable `_fast_init` to ensure backwards compatibility with
+                `transformers.__version__ < 4.6.0` for seeded model initialization. This argument will be removed
+                at the next major version. See [pull request 11471](https://github.com/huggingface/transformers/pull/11471) for more information.
 
-            kwargs (remaining dictionary of keyword arguments, `optional`):
+                </Tip>
+
+            kwargs (remaining dictionary of keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
-                :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
+                `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
                 automatically loaded:
 
-                    - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
-                      underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
+                    - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
+                      underlying model's `__init__` method (we assume all relevant updates to the configuration have
                       already been done)
-                    - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
-                      initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
-                      ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
-                      with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
-                      attribute will be passed to the underlying model's ``__init__`` function.
+                    - If a configuration is not provided, `kwargs` will be first passed to the configuration class
+                      initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of
+                      `kwargs` that corresponds to a configuration attribute will be used to override said attribute
+                      with the supplied `kwargs` value. Remaining keys that do not correspond to any configuration
+                      attribute will be passed to the underlying model's `__init__` function.
 
-        .. note::
+        <Tip>
 
-            Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+        Passing `use_auth_token=True` is required when you want to use a private model.
 
-        .. note::
+        </Tip>
 
-            Activate the special `"offline-mode"
-            <https://huggingface.co/transformers/installation.html#offline-mode>`__ to use this method in a firewalled
-            environment.
+        <Tip>
 
-        Examples::
+        Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to use this method in a firewalled
+        environment.
 
-            >>> from transformers import BertConfig, BertModel
-            >>> # Download model and configuration from huggingface.co and cache.
-            >>> model = BertModel.from_pretrained('bert-base-uncased')
-            >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
-            >>> model = BertModel.from_pretrained('./test/saved_model/')
-            >>> # Update configuration during loading.
-            >>> model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
-            >>> assert model.config.output_attentions == True
-            >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
-            >>> config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
-            >>> model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-            >>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower)
-            >>> model = BertModel.from_pretrained('bert-base-uncased', from_flax=True)
+        </Tip>
 
-        """
+        Examples:
+
+        ```python
+        >>> from transformers import BertConfig, BertModel
+        >>> # Download model and configuration from huggingface.co and cache.
+        >>> model = BertModel.from_pretrained('bert-base-uncased')
+        >>> # Model was saved using `save_pretrained('./test/saved_model/')` (for example purposes, not runnable).
+        >>> model = BertModel.from_pretrained('./test/saved_model/')
+        >>> # Update configuration during loading.
+        >>> model = BertModel.from_pretrained('bert-base-uncased', output_attentions=True)
+        >>> assert model.config.output_attentions == True
+        >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable).
+        >>> config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
+        >>> model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        >>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower)
+        >>> model = BertModel.from_pretrained('bert-base-uncased', from_flax=True)
+        ```"""
         config = kwargs.pop("config", None)
         state_dict = kwargs.pop("state_dict", None)
         cache_dir = kwargs.pop("cache_dir", None)
@@ -1747,8 +1749,8 @@ class Conv1D(nn.Module):
     Basically works like a linear layer but the weights are transposed.
 
     Args:
-        nf (:obj:`int`): The number of output features.
-        nx (:obj:`int`): The number of input features.
+        nf (`int`): The number of output features.
+        nx (`int`): The number of input features.
     """
 
     def __init__(self, nf, nx):
@@ -1771,8 +1773,8 @@ class PoolerStartLogits(nn.Module):
     Compute SQuAD start logits from sequence hidden states.
 
     Args:
-        config (:class:`~transformers.PretrainedConfig`):
-            The config used by the model, will be used to grab the :obj:`hidden_size` of the model.
+        config ([`PretrainedConfig`]):
+            The config used by the model, will be used to grab the `hidden_size` of the model.
     """
 
     def __init__(self, config: PretrainedConfig):
@@ -1784,14 +1786,14 @@ class PoolerStartLogits(nn.Module):
     ) -> torch.FloatTensor:
         """
         Args:
-            hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                 The final hidden states of the model.
-            p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
+            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                 Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                 should be masked.
 
         Returns:
-            :obj:`torch.FloatTensor`: The start logits for SQuAD.
+            `torch.FloatTensor`: The start logits for SQuAD.
         """
         x = self.dense(hidden_states).squeeze(-1)
 
@@ -1809,9 +1811,9 @@ class PoolerEndLogits(nn.Module):
     Compute SQuAD end logits from sequence hidden states.
 
     Args:
-        config (:class:`~transformers.PretrainedConfig`):
-            The config used by the model, will be used to grab the :obj:`hidden_size` of the model and the
-            :obj:`layer_norm_eps` to use.
+        config ([`PretrainedConfig`]):
+            The config used by the model, will be used to grab the `hidden_size` of the model and the
+            `layer_norm_eps` to use.
     """
 
     def __init__(self, config: PretrainedConfig):
@@ -1830,23 +1832,25 @@ class PoolerEndLogits(nn.Module):
     ) -> torch.FloatTensor:
         """
         Args:
-            hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                 The final hidden states of the model.
-            start_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`, `optional`):
+            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                 The hidden states of the first tokens for the labeled span.
-            start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                 The position of the first token for the labeled span.
-            p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
+            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                 Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                 should be masked.
 
-        .. note::
+        <Tip>
 
-            One of ``start_states`` or ``start_positions`` should be not obj:`None`. If both are set,
-            ``start_positions`` overrides ``start_states``.
+        One of `start_states` or `start_positions` should not be `None`. If both are set,
+        `start_positions` overrides `start_states`.
+
+        </Tip>
 
         Returns:
-            :obj:`torch.FloatTensor`: The end logits for SQuAD.
+            `torch.FloatTensor`: The end logits for SQuAD.
         """
         assert (
             start_states is not None or start_positions is not None
@@ -1876,8 +1880,8 @@ class PoolerAnswerClass(nn.Module):
     Compute SQuAD 2.0 answer class from classification and start tokens hidden states.
 
     Args:
-        config (:class:`~transformers.PretrainedConfig`):
-            The config used by the model, will be used to grab the :obj:`hidden_size` of the model.
+        config ([`PretrainedConfig`]):
+            The config used by the model, will be used to grab the `hidden_size` of the model.
     """
 
     def __init__(self, config):
@@ -1895,22 +1899,24 @@ class PoolerAnswerClass(nn.Module):
     ) -> torch.FloatTensor:
         """
         Args:
-            hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                 The final hidden states of the model.
-            start_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`, `optional`):
+            start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*):
                 The hidden states of the first tokens for the labeled span.
-            start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                 The position of the first token for the labeled span.
-            cls_index (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-                Position of the CLS token for each sentence in the batch. If :obj:`None`, takes the last token.
+            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
 
-        .. note::
+        <Tip>
 
-            One of ``start_states`` or ``start_positions`` should be not obj:`None`. If both are set,
-            ``start_positions`` overrides ``start_states``.
+        One of `start_states` or `start_positions` should not be `None`. If both are set,
+        `start_positions` overrides `start_states`.
+
+        </Tip>
 
         Returns:
-            :obj:`torch.FloatTensor`: The SQuAD 2.0 answer class.
+            `torch.FloatTensor`: The SQuAD 2.0 answer class.
         """
         # No dependency on end_feature so that we can obtain one single `cls_logits` for each sample.
         hsz = hidden_states.shape[-1]
@@ -1937,23 +1943,23 @@ class PoolerAnswerClass(nn.Module):
 @dataclass
 class SquadHeadOutput(ModelOutput):
     """
-    Base class for outputs of question answering models using a :class:`~transformers.modeling_utils.SQuADHead`.
+    Base class for outputs of question answering models using a [`~modeling_utils.SQuADHead`].
 
     Args:
-        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned if both :obj:`start_positions` and :obj:`end_positions` are provided):
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided):
             Classification loss as the sum of start token, end token (and is_impossible if provided) classification
             losses.
-        start_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
+        start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
             Log probabilities for the top config.start_n_top start token possibilities (beam-search).
-        start_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
+        start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
             Indices for the top config.start_n_top start token possibilities (beam-search).
-        end_top_log_probs (``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
-            Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities
+        end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+            Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities
             (beam-search).
-        end_top_index (``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
-            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
-        cls_logits (``torch.FloatTensor`` of shape ``(batch_size,)``, `optional`, returned if ``start_positions`` or ``end_positions`` is not provided):
-            Log probabilities for the ``is_impossible`` label of the answers.
+        end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+            Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search).
+        cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided):
+            Log probabilities for the `is_impossible` label of the answers.
 
     """
 
@@ -1970,9 +1976,9 @@ class SQuADHead(nn.Module):
     A SQuAD head inspired by XLNet.
 
     Args:
-        config (:class:`~transformers.PretrainedConfig`):
-            The config used by the model, will be used to grab the :obj:`hidden_size` of the model and the
-            :obj:`layer_norm_eps` to use.
+        config ([`PretrainedConfig`]):
+            The config used by the model, will be used to grab the `hidden_size` of the model and the
+            `layer_norm_eps` to use.
     """
 
     def __init__(self, config):
@@ -1997,21 +2003,21 @@ class SQuADHead(nn.Module):
     ) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]:
         """
         Args:
-            hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len, hidden_size)`):
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`):
                 Final hidden states of the model on the sequence tokens.
-            start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                 Positions of the first token for the labeled span.
-            end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                 Positions of the last token for the labeled span.
-            cls_index (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-                Position of the CLS token for each sentence in the batch. If :obj:`None`, takes the last token.
-            is_impossible (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
+            cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                Position of the CLS token for each sentence in the batch. If `None`, takes the last token.
+            is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
                 Whether the question has a possible answer in the paragraph or not.
-            p_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, seq_len)`, `optional`):
+            p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*):
                 Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token
                 should be masked.
-            return_dict (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+            return_dict (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
 
         Returns:
         """
@@ -2087,26 +2093,26 @@ class SequenceSummary(nn.Module):
     Compute a single vector summary of a sequence hidden states.
 
     Args:
-        config (:class:`~transformers.PretrainedConfig`):
+        config ([`PretrainedConfig`]):
             The config used by the model. Relevant arguments in the config class of the model are (refer to the actual
             config class of your model for the default values it uses):
 
-            - **summary_type** (:obj:`str`) -- The method to use to make this summary. Accepted values are:
+            - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are:
 
-                - :obj:`"last"` -- Take the last token hidden state (like XLNet)
-                - :obj:`"first"` -- Take the first token hidden state (like Bert)
-                - :obj:`"mean"` -- Take the mean of all tokens hidden states
-                - :obj:`"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
-                - :obj:`"attn"` -- Not implemented now, use multi-head attention
+                - `"last"` -- Take the last token hidden state (like XLNet)
+                - `"first"` -- Take the first token hidden state (like Bert)
+                - `"mean"` -- Take the mean of all tokens hidden states
+                - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2)
+                - `"attn"` -- Not implemented now, use multi-head attention
 
-            - **summary_use_proj** (:obj:`bool`) -- Add a projection after the vector extraction.
-            - **summary_proj_to_labels** (:obj:`bool`) -- If :obj:`True`, the projection outputs to
-              :obj:`config.num_labels` classes (otherwise to :obj:`config.hidden_size`).
-            - **summary_activation** (:obj:`Optional[str]`) -- Set to :obj:`"tanh"` to add a tanh activation to the
-              output, another string or :obj:`None` will add no activation.
-            - **summary_first_dropout** (:obj:`float`) -- Optional dropout probability before the projection and
+            - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction.
+            - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to
+              `config.num_labels` classes (otherwise to `config.hidden_size`).
+            - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the
+              output, another string or `None` will add no activation.
+            - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and
               activation.
-            - **summary_last_dropout** (:obj:`float`)-- Optional dropout probability after the projection and
+            - **summary_last_dropout** (`float`) -- Optional dropout probability after the projection and
               activation.
     """
 
@@ -2146,14 +2152,14 @@ class SequenceSummary(nn.Module):
         Compute a single vector summary of a sequence hidden states.
 
         Args:
-            hidden_states (:obj:`torch.FloatTensor` of shape :obj:`[batch_size, seq_len, hidden_size]`):
+            hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`):
                 The hidden states of the last layer.
-            cls_index (:obj:`torch.LongTensor` of shape :obj:`[batch_size]` or :obj:`[batch_size, ...]` where ... are optional leading dimensions of :obj:`hidden_states`, `optional`):
-                Used if :obj:`summary_type == "cls_index"` and takes the last token of the sequence as classification
+            cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*):
+                Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification
                 token.
 
         Returns:
-            :obj:`torch.FloatTensor`: The summary of the sequence hidden states.
+            `torch.FloatTensor`: The summary of the sequence hidden states.
         """
         if self.summary_type == "last":
             output = hidden_states[:, -1]
@@ -2189,7 +2195,7 @@ def unwrap_model(model: nn.Module) -> nn.Module:
     Recursively unwraps a model from potential containers (as used in distributed training).
 
     Args:
-        model (:obj:`torch.nn.Module`): The model to unwrap.
+        model (`torch.nn.Module`): The model to unwrap.
     """
     # since there could be multiple levels of wrapping, unwrap recursively
     if hasattr(model, "module"):
@@ -2205,12 +2211,12 @@ def prune_linear_layer(layer: nn.Linear, index: torch.LongTensor, dim: int = 0)
     Used to remove heads.
 
     Args:
-        layer (:obj:`torch.nn.Linear`): The layer to prune.
-        index (:obj:`torch.LongTensor`): The indices to keep in the layer.
-        dim (:obj:`int`, `optional`, defaults to 0): The dimension on which to keep the indices.
+        layer (`torch.nn.Linear`): The layer to prune.
+        index (`torch.LongTensor`): The indices to keep in the layer.
+        dim (`int`, *optional*, defaults to 0): The dimension on which to keep the indices.
 
     Returns:
-        :obj:`torch.nn.Linear`: The pruned layer as a new layer with :obj:`requires_grad=True`.
+        `torch.nn.Linear`: The pruned layer as a new layer with `requires_grad=True`.
     """
     index = index.to(layer.weight.device)
     W = layer.weight.index_select(dim, index).clone().detach()
@@ -2240,12 +2246,12 @@ def prune_conv1d_layer(layer: Conv1D, index: torch.LongTensor, dim: int = 1) ->
     Used to remove heads.
 
     Args:
-        layer (:class:`~transformers.modeling_utils.Conv1D`): The layer to prune.
-        index (:obj:`torch.LongTensor`): The indices to keep in the layer.
-        dim (:obj:`int`, `optional`, defaults to 1): The dimension on which to keep the indices.
+        layer ([`~modeling_utils.Conv1D`]): The layer to prune.
+        index (`torch.LongTensor`): The indices to keep in the layer.
+        dim (`int`, *optional*, defaults to 1): The dimension on which to keep the indices.
 
     Returns:
-        :class:`~transformers.modeling_utils.Conv1D`: The pruned layer as a new layer with :obj:`requires_grad=True`.
+        [`~modeling_utils.Conv1D`]: The pruned layer as a new layer with `requires_grad=True`.
     """
     index = index.to(layer.weight.device)
     W = layer.weight.index_select(dim, index).clone().detach()
@@ -2274,13 +2280,13 @@ def prune_layer(
     Used to remove heads.
 
     Args:
-        layer (:obj:`Union[torch.nn.Linear, Conv1D]`): The layer to prune.
-        index (:obj:`torch.LongTensor`): The indices to keep in the layer.
-        dim (:obj:`int`, `optional`): The dimension on which to keep the indices.
+        layer (`Union[torch.nn.Linear, Conv1D]`): The layer to prune.
+        index (`torch.LongTensor`): The indices to keep in the layer.
+        dim (`int`, *optional*): The dimension on which to keep the indices.
 
     Returns:
-        :obj:`torch.nn.Linear` or :class:`~transformers.modeling_utils.Conv1D`: The pruned layer as a new layer with
-        :obj:`requires_grad=True`.
+        `torch.nn.Linear` or [`~modeling_utils.Conv1D`]: The pruned layer as a new layer with
+        `requires_grad=True`.
     """
     if isinstance(layer, nn.Linear):
         return prune_linear_layer(layer, index, dim=0 if dim is None else dim)
@@ -2294,37 +2300,38 @@ def apply_chunking_to_forward(
     forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors
 ) -> torch.Tensor:
     """
-    This function chunks the :obj:`input_tensors` into smaller input tensor parts of size :obj:`chunk_size` over the
-    dimension :obj:`chunk_dim`. It then applies a layer :obj:`forward_fn` to each chunk independently to save memory.
+    This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the
+    dimension `chunk_dim`. It then applies a layer `forward_fn` to each chunk independently to save memory.
 
-    If the :obj:`forward_fn` is independent across the :obj:`chunk_dim` this function will yield the same result as
-    directly applying :obj:`forward_fn` to :obj:`input_tensors`.
+    If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as
+    directly applying `forward_fn` to `input_tensors`.
 
     Args:
-        forward_fn (:obj:`Callable[..., torch.Tensor]`):
+        forward_fn (`Callable[..., torch.Tensor]`):
             The forward function of the model.
-        chunk_size (:obj:`int`):
-            The chunk size of a chunked tensor: :obj:`num_chunks = len(input_tensors[0]) / chunk_size`.
-        chunk_dim (:obj:`int`):
-            The dimension over which the :obj:`input_tensors` should be chunked.
-        input_tensors (:obj:`Tuple[torch.Tensor]`):
-            The input tensors of ``forward_fn`` which will be chunked
+        chunk_size (`int`):
+            The chunk size of a chunked tensor: `num_chunks = len(input_tensors[0]) / chunk_size`.
+        chunk_dim (`int`):
+            The dimension over which the `input_tensors` should be chunked.
+        input_tensors (`Tuple[torch.Tensor]`):
+            The input tensors of `forward_fn` which will be chunked
 
     Returns:
-        :obj:`torch.Tensor`: A tensor with the same shape as the :obj:`forward_fn` would have given if applied`.
+        `torch.Tensor`: A tensor with the same shape as the `forward_fn` would have given if applied.
 
 
-    Examples::
+    Examples:
 
-        # rename the usual forward() fn to forward_chunk()
-        def forward_chunk(self, hidden_states):
-            hidden_states = self.decoder(hidden_states)
-            return hidden_states
+    ```python
+    # rename the usual forward() fn to forward_chunk()
+    def forward_chunk(self, hidden_states):
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
 
-        # implement a chunked forward function
-        def forward(self, hidden_states):
-            return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states)
-    """
+    # implement a chunked forward function
+    def forward(self, hidden_states):
+        return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states)
+    ```"""
 
     assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors"
 
diff --git a/src/transformers/models/albert/configuration_albert.py b/src/transformers/models/albert/configuration_albert.py
index 2bf3171d0d..4f9b6be85e 100644
--- a/src/transformers/models/albert/configuration_albert.py
+++ b/src/transformers/models/albert/configuration_albert.py
@@ -35,79 +35,78 @@ ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class AlbertConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.AlbertModel` or a
-    :class:`~transformers.TFAlbertModel`. It is used to instantiate an ALBERT model according to the specified
+    This is the configuration class to store the configuration of a [`AlbertModel`] or a
+    [`TFAlbertModel`]. It is used to instantiate an ALBERT model according to the specified
     arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
-    configuration to that of the ALBERT `xxlarge <https://huggingface.co/albert-xxlarge-v2>`__ architecture.
+    configuration to that of the ALBERT [xxlarge](https://huggingface.co/albert-xxlarge-v2) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30000):
+        vocab_size (`int`, *optional*, defaults to 30000):
             Vocabulary size of the ALBERT model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.AlbertModel` or
-            :class:`~transformers.TFAlbertModel`.
-        embedding_size (:obj:`int`, `optional`, defaults to 128):
+            `input_ids` passed when calling [`AlbertModel`] or
+            [`TFAlbertModel`].
+        embedding_size (`int`, *optional*, defaults to 128):
             Dimensionality of vocabulary embeddings.
-        hidden_size (:obj:`int`, `optional`, defaults to 4096):
+        hidden_size (`int`, *optional*, defaults to 4096):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_hidden_groups (:obj:`int`, `optional`, defaults to 1):
+        num_hidden_groups (`int`, *optional*, defaults to 1):
             Number of groups for the hidden layers, parameters in the same group are shared.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 64):
+        num_attention_heads (`int`, *optional*, defaults to 64):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 16384):
+        intermediate_size (`int`, *optional*, defaults to 16384):
             The dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        inner_group_num (:obj:`int`, `optional`, defaults to 1):
+        inner_group_num (`int`, *optional*, defaults to 1):
             The number of inner repetition of attention and ffn.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu_new"`):
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.AlbertModel` or
-            :class:`~transformers.TFAlbertModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`AlbertModel`] or
+            [`TFAlbertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        classifier_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for attached classifiers.
-        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
-            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
-            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
-            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
-            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
-            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
-            <https://arxiv.org/abs/2009.13658>`__.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
+            `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
+            `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
+            *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
 
-    Examples::
+    Examples:
 
-        >>> from transformers import AlbertConfig, AlbertModel
-        >>> # Initializing an ALBERT-xxlarge style configuration
-        >>> albert_xxlarge_configuration = AlbertConfig()
+    ```python
+    >>> from transformers import AlbertConfig, AlbertModel
+    >>> # Initializing an ALBERT-xxlarge style configuration
+    >>> albert_xxlarge_configuration = AlbertConfig()
 
-        >>> # Initializing an ALBERT-base style configuration
-        >>> albert_base_configuration = AlbertConfig(
-        ...      hidden_size=768,
-        ...      num_attention_heads=12,
-        ...      intermediate_size=3072,
-        ...  )
+    >>> # Initializing an ALBERT-base style configuration
+    >>> albert_base_configuration = AlbertConfig(
+    ...      hidden_size=768,
+    ...      num_attention_heads=12,
+    ...      intermediate_size=3072,
+    ...  )
 
-        >>> # Initializing a model from the ALBERT-base style configuration
-        >>> model = AlbertModel(albert_xxlarge_configuration)
+    >>> # Initializing a model from the ALBERT-xxlarge style configuration
+    >>> model = AlbertModel(albert_xxlarge_configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "albert"
 
diff --git a/src/transformers/models/albert/modeling_flax_albert.py b/src/transformers/models/albert/modeling_flax_albert.py
index 7ff4552163..c7ae117a1b 100644
--- a/src/transformers/models/albert/modeling_flax_albert.py
+++ b/src/transformers/models/albert/modeling_flax_albert.py
@@ -742,18 +742,20 @@ class FlaxAlbertForPreTraining(FlaxAlbertPreTrainedModel):
 FLAX_ALBERT_FOR_PRETRAINING_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> from transformers import AlbertTokenizer, FlaxAlbertForPreTraining
+    ```python
+    >>> from transformers import AlbertTokenizer, FlaxAlbertForPreTraining
 
-        >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-        >>> model = FlaxAlbertForPreTraining.from_pretrained('albert-base-v2')
+    >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+    >>> model = FlaxAlbertForPreTraining.from_pretrained('albert-base-v2')
 
-        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
-        >>> outputs = model(**inputs)
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
+    >>> outputs = model(**inputs)
 
-        >>> prediction_logits = outputs.prediction_logits
-        >>> seq_relationship_logits = outputs.sop_logits
+    >>> prediction_logits = outputs.prediction_logits
+    >>> seq_relationship_logits = outputs.sop_logits
+    ```
 """
 
 overwrite_call_docstring(
diff --git a/src/transformers/models/albert/modeling_tf_albert.py b/src/transformers/models/albert/modeling_tf_albert.py
index 56ced8ec4d..05c755188f 100644
--- a/src/transformers/models/albert/modeling_tf_albert.py
+++ b/src/transformers/models/albert/modeling_tf_albert.py
@@ -885,20 +885,21 @@ class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss):
         r"""
         Return:
 
-        Example::
+        Example:
 
-            >>> import tensorflow as tf
-            >>> from transformers import AlbertTokenizer, TFAlbertForPreTraining
+        ```python
+        >>> import tensorflow as tf
+        >>> from transformers import AlbertTokenizer, TFAlbertForPreTraining
 
-            >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
-            >>> model = TFAlbertForPreTraining.from_pretrained('albert-base-v2')
+        >>> tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+        >>> model = TFAlbertForPreTraining.from_pretrained('albert-base-v2')
 
-            >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
-            >>> outputs = model(input_ids)
+        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
+        >>> outputs = model(input_ids)
 
-            >>> prediction_logits = outputs.prediction_logits
-            >>> sop_logits = outputs.sop_logits
-        """
+        >>> prediction_logits = outputs.prediction_logits
+        >>> sop_logits = outputs.sop_logits
+        ```"""
 
         inputs = input_processing(
             func=self.call,
diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py
index 6caa93db57..1d91860d0d 100644
--- a/src/transformers/models/albert/tokenization_albert.py
+++ b/src/transformers/models/albert/tokenization_albert.py
@@ -58,68 +58,73 @@ SPIECE_UNDERLINE = "▁"
 
 class AlbertTokenizer(PreTrainedTokenizer):
     """
-    Construct an ALBERT tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Construct an ALBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        remove_space (`bool`, *optional*, defaults to `True`):
             Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
-        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        keep_accents (`bool`, *optional*, defaults to `False`):
             Whether or not to keep accents when tokenizing.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
     Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -252,17 +257,17 @@ class AlbertTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An ALBERT sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -275,18 +280,18 @@ class AlbertTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -305,21 +310,21 @@ class AlbertTokenizer(PreTrainedTokenizer):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
         sequence pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py
index 9eb91ec555..3c5adfbd21 100644
--- a/src/transformers/models/albert/tokenization_albert_fast.py
+++ b/src/transformers/models/albert/tokenization_albert_fast.py
@@ -72,44 +72,46 @@ SPIECE_UNDERLINE = "▁"
 
 class AlbertTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
-    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__. This tokenizer
-    inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should
+    Construct a "fast" ALBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This tokenizer
+    inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
     refer to this superclass for more information regarding those methods
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        remove_space (`bool`, *optional*, defaults to `True`):
             Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
-        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        keep_accents (`bool`, *optional*, defaults to `False`):
             Whether or not to keep accents when tokenizing.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-               When building a sequence using special tokens, this is not the token that is used for the beginning of
-               sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
             The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
-            that is used for the end of sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            that is used for the end of sequence. The token used is the `sep_token`.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
     """
@@ -172,17 +174,17 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An ALBERT sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -197,21 +199,21 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
         sequence pair mask has the following format:
 
-        ::
-
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
         if token_ids_1 is None, only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/auto/auto_factory.py b/src/transformers/models/auto/auto_factory.py
index 34124fc272..b9afb0d363 100644
--- a/src/transformers/models/auto/auto_factory.py
+++ b/src/transformers/models/auto/auto_factory.py
@@ -28,10 +28,10 @@ logger = logging.get_logger(__name__)
 
 CLASS_DOCSTRING = """
     This is a generic model class that will be instantiated as one of the model classes of the library when created
-    with the :meth:`~transformers.BaseAutoModelClass.from_pretrained` class method or the
-    :meth:`~transformers.BaseAutoModelClass.from_config` class method.
+    with the [`~BaseAutoModelClass.from_pretrained`] class method or the
+    [`~BaseAutoModelClass.from_config`] class method.
 
-    This class cannot be instantiated directly using ``__init__()`` (throws an error).
+    This class cannot be instantiated directly using `__init__()` (throws an error).
 """
 
 FROM_CONFIG_DOCSTRING = """
@@ -39,309 +39,314 @@ FROM_CONFIG_DOCSTRING = """
 
         Note:
             Loading a model from its configuration file does **not** load the model weights. It only affects the
-            model's configuration. Use :meth:`~transformers.BaseAutoModelClass.from_pretrained` to load the model
+            model's configuration. Use [`~BaseAutoModelClass.from_pretrained`] to load the model
             weights.
 
         Args:
-            config (:class:`~transformers.PretrainedConfig`):
+            config ([`PretrainedConfig`]):
                 The model class to instantiate is selected based on the configuration class:
 
                 List options
 
-        Examples::
+        Examples:
 
-            >>> from transformers import AutoConfig, BaseAutoModelClass
-            >>> # Download configuration from huggingface.co and cache.
-            >>> config = AutoConfig.from_pretrained('checkpoint_placeholder')
-            >>> model = BaseAutoModelClass.from_config(config)
+        ```python
+        >>> from transformers import AutoConfig, BaseAutoModelClass
+        >>> # Download configuration from huggingface.co and cache.
+        >>> config = AutoConfig.from_pretrained('checkpoint_placeholder')
+        >>> model = BaseAutoModelClass.from_config(config)
+        ```
 """
 
 FROM_PRETRAINED_TORCH_DOCSTRING = """
         Instantiate one of the model classes of the library from a pretrained model.
 
-        The model class to instantiate is selected based on the :obj:`model_type` property of the config object (either
-        passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's missing,
-        by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:
+        The model class to instantiate is selected based on the `model_type` property of the config object (either
+        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing,
+        by falling back to using pattern matching on `pretrained_model_name_or_path`:
 
         List options
 
-        The model is set in evaluation mode by default using ``model.eval()`` (so for instance, dropout modules are
-        deactivated). To train the model, you should first set it back in training mode with ``model.train()``
+        The model is set in evaluation mode by default using `model.eval()` (so for instance, dropout modules are
+        deactivated). To train the model, you should first set it back in training mode with `model.train()`.
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 Can be either:
 
-                    - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
-                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                    - A path to a `directory` containing model weights saved using
-                      :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
-                    - A path or url to a `tensorflow index checkpoint file` (e.g, ``./tf_model/model.ckpt.index``). In
-                      this case, ``from_tf`` should be set to :obj:`True` and a configuration object should be provided
-                      as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+                      a user or organization name, like `dbmdz/bert-base-german-cased`.
+                    - A path to a *directory* containing model weights saved using
+                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+                    - A path or url to a *TensorFlow index checkpoint file* (e.g., `./tf_model/model.ckpt.index`). In
+                      this case, `from_tf` should be set to `True` and a configuration object should be provided
+                      as `config` argument. This loading path is slower than converting the TensorFlow checkpoint in
                       a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
-            model_args (additional positional arguments, `optional`):
-                Will be passed along to the underlying model ``__init__()`` method.
-            config (:class:`~transformers.PretrainedConfig`, `optional`):
+            model_args (additional positional arguments, *optional*):
+                Will be passed along to the underlying model `__init__()` method.
+            config ([`PretrainedConfig`], *optional*):
                 Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                 be automatically loaded when:
 
-                    - The model is a model provided by the library (loaded with the `model id` string of a pretrained
+                    - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                       model).
-                    - The model was saved using :meth:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
+                    - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded
                       by supplying the save directory.
-                    - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
-                      configuration JSON file named `config.json` is found in the directory.
-            state_dict (`Dict[str, torch.Tensor]`, `optional`):
+                    - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
+                      configuration JSON file named *config.json* is found in the directory.
+            state_dict (`Dict[str, torch.Tensor]`, *optional*):
                 A state dictionary to use instead of a state dictionary loaded from saved weights file.
 
                 This option can be used if you want to create a model from a pretrained configuration but load your own
                 weights. In this case though, you should check if using
-                :func:`~transformers.PreTrainedModel.save_pretrained` and
-                :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
-            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+                [`~PreTrainedModel.save_pretrained`] and
+                [`~PreTrainedModel.from_pretrained`] is not a simpler option.
+            cache_dir (`str` or `os.PathLike`, *optional*):
                 Path to a directory in which a downloaded pretrained model configuration should be cached if the
                 standard cache should not be used.
-            from_tf (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            from_tf (`bool`, *optional*, defaults to `False`):
                 Load the model weights from a TensorFlow checkpoint save file (see docstring of
-                ``pretrained_model_name_or_path`` argument).
-            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                `pretrained_model_name_or_path` argument).
+            force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                 cached versions if they exist.
-            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            resume_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                 file exists.
-            proxies (:obj:`Dict[str, str]`, `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info (`bool`, *optional*, defaults to `False`):
                 Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
-            local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
+            local_files_only (`bool`, *optional*, defaults to `False`):
                 Whether or not to only look at local files (e.g., not try downloading the model).
-            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            revision (`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                 identifier allowed by git.
-            trust_remote_code (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            trust_remote_code (`bool`, *optional*, defaults to `False`):
                 Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
-                should only be set to :obj:`True` for repositories you trust and in which you have read the code, as it
+                should only be set to `True` for repositories you trust and in which you have read the code, as it
                 will execute code present on the Hub on your local machine.
-            kwargs (additional keyword arguments, `optional`):
+            kwargs (additional keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
-                :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
+                `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
                 automatically loaded:
 
-                    - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
-                      underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
+                    - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
+                      underlying model's `__init__` method (we assume all relevant updates to the configuration have
                       already been done)
-                    - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
-                      initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
-                      ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
-                      with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
-                      attribute will be passed to the underlying model's ``__init__`` function.
+                    - If a configuration is not provided, `kwargs` will be first passed to the configuration class
+                      initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of
+                      `kwargs` that corresponds to a configuration attribute will be used to override said attribute
+                      with the supplied `kwargs` value. Remaining keys that do not correspond to any configuration
+                      attribute will be passed to the underlying model's `__init__` function.
 
-        Examples::
+        Examples:
 
-            >>> from transformers import AutoConfig, BaseAutoModelClass
+        ```python
+        >>> from transformers import AutoConfig, BaseAutoModelClass
 
-            >>> # Download model and configuration from huggingface.co and cache.
-            >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
+        >>> # Download model and configuration from huggingface.co and cache.
+        >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
 
-            >>> # Update configuration during loading
-            >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
-            >>> model.config.output_attentions
-            True
+        >>> # Update configuration during loading
+        >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
+        >>> model.config.output_attentions
+        True
 
-            >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            >>> config = AutoConfig.from_pretrained('./tf_model/shortcut_placeholder_tf_model_config.json')
-            >>> model = BaseAutoModelClass.from_pretrained('./tf_model/shortcut_placeholder_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+        >>> config = AutoConfig.from_pretrained('./tf_model/shortcut_placeholder_tf_model_config.json')
+        >>> model = BaseAutoModelClass.from_pretrained('./tf_model/shortcut_placeholder_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+        ```
 """
 
 FROM_PRETRAINED_TF_DOCSTRING = """
         Instantiate one of the model classes of the library from a pretrained model.
 
-        The model class to instantiate is selected based on the :obj:`model_type` property of the config object (either
-        passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's missing,
-        by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:
+        The model class to instantiate is selected based on the `model_type` property of the config object (either
+        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing,
+        by falling back to using pattern matching on `pretrained_model_name_or_path`:
 
         List options
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 Can be either:
 
-                    - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
-                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                    - A path to a `directory` containing model weights saved using
-                      :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
-                    - A path or url to a `PyTorch state_dict save file` (e.g, ``./pt_model/pytorch_model.bin``). In
-                      this case, ``from_pt`` should be set to :obj:`True` and a configuration object should be provided
-                      as ``config`` argument. This loading path is slower than converting the PyTorch model in a
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+                      a user or organization name, like `dbmdz/bert-base-german-cased`.
+                    - A path to a *directory* containing model weights saved using
+                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+                    - A path or url to a *PyTorch state_dict save file* (e.g., `./pt_model/pytorch_model.bin`). In
+                      this case, `from_pt` should be set to `True` and a configuration object should be provided
+                      as `config` argument. This loading path is slower than converting the PyTorch model in a
                       TensorFlow model using the provided conversion scripts and loading the TensorFlow model
                       afterwards.
-            model_args (additional positional arguments, `optional`):
-                Will be passed along to the underlying model ``__init__()`` method.
-            config (:class:`~transformers.PretrainedConfig`, `optional`):
+            model_args (additional positional arguments, *optional*):
+                Will be passed along to the underlying model `__init__()` method.
+            config ([`PretrainedConfig`], *optional*):
                 Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                 be automatically loaded when:
 
-                    - The model is a model provided by the library (loaded with the `model id` string of a pretrained
+                    - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                       model).
-                    - The model was saved using :meth:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
+                    - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded
                       by supplying the save directory.
-                    - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
-                      configuration JSON file named `config.json` is found in the directory.
-            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+                    - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
+                      configuration JSON file named *config.json* is found in the directory.
+            cache_dir (`str` or `os.PathLike`, *optional*):
                 Path to a directory in which a downloaded pretrained model configuration should be cached if the
                 standard cache should not be used.
-            from_pt (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            from_pt (`bool`, *optional*, defaults to `False`):
                 Load the model weights from a PyTorch checkpoint save file (see docstring of
-                ``pretrained_model_name_or_path`` argument).
-            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                `pretrained_model_name_or_path` argument).
+            force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                 cached versions if they exist.
-            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            resume_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                 file exists.
-            proxies (:obj:`Dict[str, str]`, `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info (`bool`, *optional*, defaults to `False`):
                 Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
-            local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
+            local_files_only (`bool`, *optional*, defaults to `False`):
                 Whether or not to only look at local files (e.g., not try downloading the model).
-            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            revision (`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                 identifier allowed by git.
-            trust_remote_code (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            trust_remote_code (`bool`, *optional*, defaults to `False`):
                 Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
-                should only be set to :obj:`True` for repositories you trust and in which you have read the code, as it
+                should only be set to `True` for repositories you trust and in which you have read the code, as it
                 will execute code present on the Hub on your local machine.
-            kwargs (additional keyword arguments, `optional`):
+            kwargs (additional keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
-                :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
+                `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
                 automatically loaded:
 
-                    - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
-                      underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
+                    - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
+                      underlying model's `__init__` method (we assume all relevant updates to the configuration have
                       already been done)
-                    - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
-                      initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
-                      ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
-                      with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
-                      attribute will be passed to the underlying model's ``__init__`` function.
+                    - If a configuration is not provided, `kwargs` will be first passed to the configuration class
+                      initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of
+                      `kwargs` that corresponds to a configuration attribute will be used to override said attribute
+                      with the supplied `kwargs` value. Remaining keys that do not correspond to any configuration
+                      attribute will be passed to the underlying model's `__init__` function.
 
-        Examples::
+        Examples:
 
-            >>> from transformers import AutoConfig, BaseAutoModelClass
+        ```python
+        >>> from transformers import AutoConfig, BaseAutoModelClass
 
-            >>> # Download model and configuration from huggingface.co and cache.
-            >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
+        >>> # Download model and configuration from huggingface.co and cache.
+        >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
 
-            >>> # Update configuration during loading
-            >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
-            >>> model.config.output_attentions
-            True
+        >>> # Update configuration during loading
+        >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
+        >>> model.config.output_attentions
+        True
 
-            >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
-            >>> config = AutoConfig.from_pretrained('./pt_model/shortcut_placeholder_pt_model_config.json')
-            >>> model = BaseAutoModelClass.from_pretrained('./pt_model/shortcut_placeholder_pytorch_model.bin', from_pt=True, config=config)
+        >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
+        >>> config = AutoConfig.from_pretrained('./pt_model/shortcut_placeholder_pt_model_config.json')
+        >>> model = BaseAutoModelClass.from_pretrained('./pt_model/shortcut_placeholder_pytorch_model.bin', from_pt=True, config=config)
+        ```
 """
 
 FROM_PRETRAINED_FLAX_DOCSTRING = """
         Instantiate one of the model classes of the library from a pretrained model.
 
-        The model class to instantiate is selected based on the :obj:`model_type` property of the config object (either
-        passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's missing,
-        by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:
+        The model class to instantiate is selected based on the `model_type` property of the config object (either
+        passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing,
+        by falling back to using pattern matching on `pretrained_model_name_or_path`:
 
         List options
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 Can be either:
 
-                    - A string, the `model id` of a pretrained model hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
-                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                    - A path to a `directory` containing model weights saved using
-                      :func:`~transformers.PreTrainedModel.save_pretrained`, e.g., ``./my_model_directory/``.
-                    - A path or url to a `PyTorch state_dict save file` (e.g, ``./pt_model/pytorch_model.bin``). In
-                      this case, ``from_pt`` should be set to :obj:`True` and a configuration object should be provided
-                      as ``config`` argument. This loading path is slower than converting the PyTorch model in a
+                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
+                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+                      a user or organization name, like `dbmdz/bert-base-german-cased`.
+                    - A path to a *directory* containing model weights saved using
+                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
+                    - A path or url to a *PyTorch state_dict save file* (e.g., `./pt_model/pytorch_model.bin`). In
+                      this case, `from_pt` should be set to `True` and a configuration object should be provided
+                      as `config` argument. This loading path is slower than converting the PyTorch model in a
                       TensorFlow model using the provided conversion scripts and loading the TensorFlow model
                       afterwards.
-            model_args (additional positional arguments, `optional`):
-                Will be passed along to the underlying model ``__init__()`` method.
-            config (:class:`~transformers.PretrainedConfig`, `optional`):
+            model_args (additional positional arguments, *optional*):
+                Will be passed along to the underlying model `__init__()` method.
+            config ([`PretrainedConfig`], *optional*):
                 Configuration for the model to use instead of an automatically loaded configuration. Configuration can
                 be automatically loaded when:
 
-                    - The model is a model provided by the library (loaded with the `model id` string of a pretrained
+                    - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                       model).
-                    - The model was saved using :meth:`~transformers.PreTrainedModel.save_pretrained` and is reloaded
+                    - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded
                       by supplying the save directory.
-                    - The model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a
-                      configuration JSON file named `config.json` is found in the directory.
-            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+                    - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
+                      configuration JSON file named *config.json* is found in the directory.
+            cache_dir (`str` or `os.PathLike`, *optional*):
                 Path to a directory in which a downloaded pretrained model configuration should be cached if the
                 standard cache should not be used.
-            from_pt (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            from_pt (`bool`, *optional*, defaults to `False`):
                 Load the model weights from a PyTorch checkpoint save file (see docstring of
-                ``pretrained_model_name_or_path`` argument).
-            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                `pretrained_model_name_or_path` argument).
+            force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force the (re-)download of the model weights and configuration files, overriding the
                 cached versions if they exist.
-            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            resume_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                 file exists.
-            proxies (:obj:`Dict[str, str]`, `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            output_loading_info(:obj:`bool`, `optional`, defaults to :obj:`False`):
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            output_loading_info(`bool`, *optional*, defaults to `False`):
                 Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages.
-            local_files_only(:obj:`bool`, `optional`, defaults to :obj:`False`):
+            local_files_only(`bool`, *optional*, defaults to `False`):
                 Whether or not to only look at local files (e.g., not try downloading the model).
-            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            revision(`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                 identifier allowed by git.
-            trust_remote_code (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            trust_remote_code (`bool`, *optional*, defaults to `False`):
                 Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
-                should only be set to :obj:`True` for repositories you trust and in which you have read the code, as it
+                should only be set to `True` for repositories you trust and in which you have read the code, as it
                 will execute code present on the Hub on your local machine.
-            kwargs (additional keyword arguments, `optional`):
+            kwargs (additional keyword arguments, *optional*):
                 Can be used to update the configuration object (after it being loaded) and initiate the model (e.g.,
-                :obj:`output_attentions=True`). Behaves differently depending on whether a ``config`` is provided or
+                `output_attentions=True`). Behaves differently depending on whether a `config` is provided or
                 automatically loaded:
 
-                    - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the
-                      underlying model's ``__init__`` method (we assume all relevant updates to the configuration have
+                    - If a configuration is provided with `config`, `**kwargs` will be directly passed to the
+                      underlying model's `__init__` method (we assume all relevant updates to the configuration have
                       already been done)
-                    - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class
-                      initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of
-                      ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute
-                      with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration
-                      attribute will be passed to the underlying model's ``__init__`` function.
+                    - If a configuration is not provided, `kwargs` will be first passed to the configuration class
+                      initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of
+                      `kwargs` that corresponds to a configuration attribute will be used to override said attribute
+                      with the supplied `kwargs` value. Remaining keys that do not correspond to any configuration
+                      attribute will be passed to the underlying model's `__init__` function.
 
-        Examples::
+        Examples:
 
-            >>> from transformers import AutoConfig, BaseAutoModelClass
+        ```python
+        >>> from transformers import AutoConfig, BaseAutoModelClass
 
-            >>> # Download model and configuration from huggingface.co and cache.
-            >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
+        >>> # Download model and configuration from huggingface.co and cache.
+        >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder')
 
-            >>> # Update configuration during loading
-            >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
-            >>> model.config.output_attentions
-            True
+        >>> # Update configuration during loading
+        >>> model = BaseAutoModelClass.from_pretrained('checkpoint_placeholder', output_attentions=True)
+        >>> model.config.output_attentions
+        True
 
-            >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
-            >>> config = AutoConfig.from_pretrained('./pt_model/shortcut_placeholder_pt_model_config.json')
-            >>> model = BaseAutoModelClass.from_pretrained('./pt_model/shortcut_placeholder_pytorch_model.bin', from_pt=True, config=config)
+        >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower)
+        >>> config = AutoConfig.from_pretrained('./pt_model/shortcut_placeholder_pt_model_config.json')
+        >>> model = BaseAutoModelClass.from_pretrained('./pt_model/shortcut_placeholder_pytorch_model.bin', from_pt=True, config=config)
+        ```
 """
 
 
@@ -445,9 +450,9 @@ class _BaseAutoModelClass:
         Register a new model for this class.
 
         Args:
-            config_class (:class:`~transformers.PretrainedConfig`):
+            config_class ([`PretrainedConfig`]):
                 The configuration corresponding to the model to register.
-            model_class (:class:`~transformers.PreTrainedModel`):
+            model_class ([`PreTrainedModel`]):
                 The model to register.
         """
         if hasattr(model_class, "config_class") and model_class.config_class != config_class:
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index 81e0749c51..bfe9772036 100644
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -475,9 +475,9 @@ def replace_list_option_in_docstrings(config_to_class=None, use_model_types=True
 class AutoConfig:
     r"""
     This is a generic configuration class that will be instantiated as one of the configuration classes of the library
-    when created with the :meth:`~transformers.AutoConfig.from_pretrained` class method.
+    when created with the [`~AutoConfig.from_pretrained`] class method.
 
-    This class cannot be instantiated directly using ``__init__()`` (throws an error).
+    This class cannot be instantiated directly using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -501,81 +501,81 @@ class AutoConfig:
         r"""
         Instantiate one of the configuration classes of the library from a pretrained model configuration.
 
-        The configuration class to instantiate is selected based on the :obj:`model_type` property of the config object
+        The configuration class to instantiate is selected based on the `model_type` property of the config object
         that is loaded, or when it's missing, by falling back to using pattern matching on
-        :obj:`pretrained_model_name_or_path`:
+        `pretrained_model_name_or_path`:
 
         List options
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 Can be either:
 
-                    - A string, the `model id` of a pretrained model configuration hosted inside a model repo on
-                      huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-                      namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                    - A path to a `directory` containing a configuration file saved using the
-                      :meth:`~transformers.PretrainedConfig.save_pretrained` method, or the
-                      :meth:`~transformers.PreTrainedModel.save_pretrained` method, e.g., ``./my_model_directory/``.
-                    - A path or url to a saved configuration JSON `file`, e.g.,
-                      ``./my_model_directory/configuration.json``.
-            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+                    - A string, the *model id* of a pretrained model configuration hosted inside a model repo on
+                      huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                      namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                    - A path to a *directory* containing a configuration file saved using the
+                      [`~PretrainedConfig.save_pretrained`] method, or the
+                      [`~PreTrainedModel.save_pretrained`] method, e.g., `./my_model_directory/`.
+                    - A path or url to a saved configuration JSON *file*, e.g.,
+                      `./my_model_directory/configuration.json`.
+            cache_dir (`str` or `os.PathLike`, *optional*):
                 Path to a directory in which a downloaded pretrained model configuration should be cached if the
                 standard cache should not be used.
-            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force the (re-)download the model weights and configuration files and override the
                 cached versions if they exist.
-            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            resume_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                 file exists.
-            proxies (:obj:`Dict[str, str]`, `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            revision(`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                 identifier allowed by git.
-            return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                If :obj:`False`, then this function returns just the final configuration object.
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                If `False`, then this function returns just the final configuration object.
 
-                If :obj:`True`, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs`
+                If `True`, then this function returns a `Tuple(config, unused_kwargs)` where `unused_kwargs`
                 is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e.,
-                the part of ``kwargs`` which has not been used to update ``config`` and is otherwise ignored.
-            trust_remote_code (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                the part of `kwargs` which has not been used to update `config` and is otherwise ignored.
+            trust_remote_code (`bool`, *optional*, defaults to `False`):
                 Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
-                should only be set to :obj:`True` for repositories you trust and in which you have read the code, as it
+                should only be set to `True` for repositories you trust and in which you have read the code, as it
                 will execute code present on the Hub on your local machine.
-            kwargs(additional keyword arguments, `optional`):
+            kwargs(additional keyword arguments, *optional*):
                 The values in kwargs of any keys which are configuration attributes will be used to override the loaded
                 values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
-                by the ``return_unused_kwargs`` keyword parameter.
+                by the `return_unused_kwargs` keyword parameter.
 
-        Examples::
+        Examples:
 
-            >>> from transformers import AutoConfig
+        ```python
+        >>> from transformers import AutoConfig
 
-            >>> # Download configuration from huggingface.co and cache.
-            >>> config = AutoConfig.from_pretrained('bert-base-uncased')
+        >>> # Download configuration from huggingface.co and cache.
+        >>> config = AutoConfig.from_pretrained('bert-base-uncased')
 
-            >>> # Download configuration from huggingface.co (user-uploaded) and cache.
-            >>> config = AutoConfig.from_pretrained('dbmdz/bert-base-german-cased')
+        >>> # Download configuration from huggingface.co (user-uploaded) and cache.
+        >>> config = AutoConfig.from_pretrained('dbmdz/bert-base-german-cased')
 
-            >>> # If configuration file is in a directory (e.g., was saved using `save_pretrained('./test/saved_model/')`).
-            >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/')
+        >>> # If configuration file is in a directory (e.g., was saved using `save_pretrained('./test/saved_model/')`).
+        >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/')
 
-            >>> # Load a specific configuration file.
-            >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
+        >>> # Load a specific configuration file.
+        >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
 
-            >>> # Change some config attributes when loading a pretrained config.
-            >>> config = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
-            >>> config.output_attentions
-            True
-            >>> config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False, return_unused_kwargs=True)
-            >>> config.output_attentions
-            True
-            >>> config.unused_kwargs
-            {'foo': False}
-        """
+        >>> # Change some config attributes when loading a pretrained config.
+        >>> config = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False)
+        >>> config.output_attentions
+        True
+        >>> config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attentions=True, foo=False, return_unused_kwargs=True)
+        >>> config.output_attentions
+        True
+        >>> unused_kwargs
+        {'foo': False}
+        ```"""
         kwargs["_from_auto"] = True
         kwargs["name_or_path"] = pretrained_model_name_or_path
         trust_remote_code = kwargs.pop("trust_remote_code", False)
@@ -619,8 +619,8 @@ class AutoConfig:
         Register a new configuration for this class.
 
         Args:
-            model_type (:obj:`str`): The model type like "bert" or "gpt".
-            config (:class:`~transformers.PretrainedConfig`): The config to register.
+            model_type (`str`): The model type like "bert" or "gpt".
+            config ([`PretrainedConfig`]): The config to register.
         """
         if issubclass(config, PretrainedConfig) and config.model_type != model_type:
             raise ValueError(
diff --git a/src/transformers/models/auto/dynamic.py b/src/transformers/models/auto/dynamic.py
index daf8161a05..1185298d85 100644
--- a/src/transformers/models/auto/dynamic.py
+++ b/src/transformers/models/auto/dynamic.py
@@ -120,60 +120,63 @@ def get_class_from_dynamic_module(
     """
     Extracts a class from a module file, present in the local folder or repository of a model.
 
-    .. warning::
+    <Tip warning={true}>
 
-        Calling this function will execute the code in the module file found locally or downloaded from the Hub. It
-        should therefore only be called on trusted repos.
+    Calling this function will execute the code in the module file found locally or downloaded from the Hub. It
+    should therefore only be called on trusted repos.
+
+    </Tip>
 
     Args:
-        pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+        pretrained_model_name_or_path (`str` or `os.PathLike`):
             This can be either:
 
-            - a string, the `model id` of a pretrained model configuration hosted inside a model repo on
-              huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-              namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-            - a path to a `directory` containing a configuration file saved using the
-              :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g., ``./my_model_directory/``.
+            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
+              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+              namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+            - a path to a *directory* containing a configuration file saved using the
+              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
 
-        module_file (:obj:`str`):
+        module_file (`str`):
             The name of the module file containing the class to look for.
-        class_name (:obj:`str`):
+        class_name (`str`):
             The name of the class to import in the module.
-        cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+        cache_dir (`str` or `os.PathLike`, *optional*):
             Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
             cache should not be used.
-        force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        force_download (`bool`, *optional*, defaults to `False`):
             Whether or not to force to (re-)download the configuration files and override the cached versions if they
             exist.
-        resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        resume_download (`bool`, *optional*, defaults to `False`):
             Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
-        proxies (:obj:`Dict[str, str]`, `optional`):
-            A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
-        use_auth_token (:obj:`str` or `bool`, `optional`):
-            The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
-            generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
-        revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+        proxies (`Dict[str, str]`, *optional*):
+            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+        use_auth_token (`str` or `bool`, *optional*):
+            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+            generated when running `transformers-cli login` (stored in `~/.huggingface`).
+        revision(`str`, *optional*, defaults to `"main"`):
             The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-            git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
             identifier allowed by git.
-        local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            If :obj:`True`, will only try to load the tokenizer configuration from local files.
+        local_files_only (`bool`, *optional*, defaults to `False`):
+            If `True`, will only try to load the tokenizer configuration from local files.
 
-    .. note::
+    <Tip>
 
-        Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+    Passing `use_auth_token=True` is required when you want to use a private model.
 
+    </Tip>
 
     Returns:
-        :obj:`type`: The class, dynamically imported from the module.
+        `type`: The class, dynamically imported from the module.
 
-    Examples::
+    Examples:
 
-        # Download module `modeling.py` from huggingface.co and cache then extract the class `MyBertModel` from this
-        # module.
-        cls = get_class_from_dynamic_module("sgugger/my-bert-model", "modeling.py", "MyBertModel")
-    """
+    ```python
+    # Download module `modeling.py` from huggingface.co and cache then extract the class `MyBertModel` from this
+    # module.
+    cls = get_class_from_dynamic_module("sgugger/my-bert-model", "modeling.py", "MyBertModel")
+    ```"""
     if is_offline_mode() and not local_files_only:
         logger.info("Offline mode: forcing local_files_only=True")
         local_files_only = True
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index 45f12953f9..d5b2213ec4 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -65,9 +65,9 @@ def feature_extractor_class_from_name(class_name: str):
 class AutoFeatureExtractor:
     r"""
     This is a generic feature extractor class that will be instantiated as one of the feature extractor classes of the
-    library when created with the :meth:`AutoFeatureExtractor.from_pretrained` class method.
+    library when created with the [`AutoFeatureExtractor.from_pretrained`] class method.
 
-    This class cannot be instantiated directly using ``__init__()`` (throws an error).
+    This class cannot be instantiated directly using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -82,68 +82,69 @@ class AutoFeatureExtractor:
         r"""
         Instantiate one of the feature extractor classes of the library from a pretrained model vocabulary.
 
-        The feature extractor class to instantiate is selected based on the :obj:`model_type` property of the config
-        object (either passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when
-        it's missing, by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:
+        The feature extractor class to instantiate is selected based on the `model_type` property of the config
+        object (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when
+        it's missing, by falling back to using pattern matching on `pretrained_model_name_or_path`:
 
         List options
 
         Params:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 This can be either:
 
-                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a feature extractor file saved using the
-                  :func:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` method, e.g.,
-                  ``./my_model_directory/``.
-                - a path or url to a saved feature extractor JSON `file`, e.g.,
-                  ``./my_model_directory/preprocessor_config.json``.
-            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g.,
+                  `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
+            cache_dir (`str` or `os.PathLike`, *optional*):
                 Path to a directory in which a downloaded pretrained model feature extractor should be cached if the
                 standard cache should not be used.
-            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force to (re-)download the feature extractor files and override the cached versions
                 if they exist.
-            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            resume_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to delete incompletely received file. Attempts to resume the download if such a file
                 exists.
-            proxies (:obj:`Dict[str, str]`, `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
-            use_auth_token (:obj:`str` or `bool`, `optional`):
-                The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
-                generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
-            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            use_auth_token (`str` or `bool`, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+                generated when running `transformers-cli login` (stored in `~/.huggingface`).
+            revision(`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                 identifier allowed by git.
-            return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                If :obj:`False`, then this function returns just the final feature extractor object. If :obj:`True`,
-                then this functions returns a :obj:`Tuple(feature_extractor, unused_kwargs)` where `unused_kwargs` is a
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                If `False`, then this function returns just the final feature extractor object. If `True`,
+                then this function returns a `Tuple(feature_extractor, unused_kwargs)` where `unused_kwargs` is a
                 dictionary consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the
-                part of ``kwargs`` which has not been used to update ``feature_extractor`` and is otherwise ignored.
-            kwargs (:obj:`Dict[str, Any]`, `optional`):
+                part of `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
+            kwargs (`Dict[str, Any]`, *optional*):
                 The values in kwargs of any keys which are feature extractor attributes will be used to override the
                 loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
-                controlled by the ``return_unused_kwargs`` keyword parameter.
+                controlled by the `return_unused_kwargs` keyword parameter.
 
-        .. note::
+        <Tip>
 
-            Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+        Passing `use_auth_token=True` is required when you want to use a private model.
 
-        Examples::
+        </Tip>
 
-            >>> from transformers import AutoFeatureExtractor
+        Examples:
 
-            >>> # Download feature extractor from huggingface.co and cache.
-            >>> feature_extractor = AutoFeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h')
+        ```python
+        >>> from transformers import AutoFeatureExtractor
 
-            >>> # If feature extractor files are in a directory (e.g. feature extractor was saved using `save_pretrained('./test/saved_model/')`)
-            >>> feature_extractor = AutoFeatureExtractor.from_pretrained('./test/saved_model/')
+        >>> # Download feature extractor from huggingface.co and cache.
+        >>> feature_extractor = AutoFeatureExtractor.from_pretrained('facebook/wav2vec2-base-960h')
 
-        """
+        >>> # If feature extractor files are in a directory (e.g. feature extractor was saved using `save_pretrained('./test/saved_model/')`)
+        >>> feature_extractor = AutoFeatureExtractor.from_pretrained('./test/saved_model/')
+        ```"""
         config = kwargs.pop("config", None)
         kwargs["_from_auto"] = True
 
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 5a5cf8ac8a..f9f4868369 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -62,9 +62,9 @@ def processor_class_from_name(class_name: str):
 class AutoProcessor:
     r"""
     This is a generic processor class that will be instantiated as one of the processor classes of the library when
-    created with the :meth:`AutoProcessor.from_pretrained` class method.
+    created with the [`AutoProcessor.from_pretrained`] class method.
 
-    This class cannot be instantiated directly using ``__init__()`` (throws an error).
+    This class cannot be instantiated directly using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -79,64 +79,65 @@ class AutoProcessor:
         r"""
         Instantiate one of the processor classes of the library from a pretrained model vocabulary.
 
-        The processor class to instantiate is selected based on the :obj:`model_type` property of the config object
-        (either passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible):
+        The processor class to instantiate is selected based on the `model_type` property of the config object
+        (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible):
 
         List options
 
         Params:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 This can be either:
 
-                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a processor files saved using the :obj:`save_pretrained()` method,
-                  e.g., ``./my_model_directory/``.
-            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing processor files saved using the `save_pretrained()` method,
+                  e.g., `./my_model_directory/`.
+            cache_dir (`str` or `os.PathLike`, *optional*):
                 Path to a directory in which a downloaded pretrained model feature extractor should be cached if the
                 standard cache should not be used.
-            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force to (re-)download the feature extractor files and override the cached versions
                 if they exist.
-            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            resume_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to delete incompletely received file. Attempts to resume the download if such a file
                 exists.
-            proxies (:obj:`Dict[str, str]`, `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
-            use_auth_token (:obj:`str` or `bool`, `optional`):
-                The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
-                generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
-            revision (:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            use_auth_token (`str` or `bool`, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+                generated when running `transformers-cli login` (stored in `~/.huggingface`).
+            revision (`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                 identifier allowed by git.
-            return_unused_kwargs (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                If :obj:`False`, then this function returns just the final feature extractor object. If :obj:`True`,
-                then this functions returns a :obj:`Tuple(feature_extractor, unused_kwargs)` where `unused_kwargs` is a
+            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+                If `False`, then this function returns just the final feature extractor object. If `True`,
+                then this function returns a `Tuple(feature_extractor, unused_kwargs)` where `unused_kwargs` is a
                 dictionary consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the
-                part of ``kwargs`` which has not been used to update ``feature_extractor`` and is otherwise ignored.
-            kwargs (:obj:`Dict[str, Any]`, `optional`):
+                part of `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored.
+            kwargs (`Dict[str, Any]`, *optional*):
                 The values in kwargs of any keys which are feature extractor attributes will be used to override the
                 loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is
-                controlled by the ``return_unused_kwargs`` keyword parameter.
+                controlled by the `return_unused_kwargs` keyword parameter.
 
-        .. note::
+        <Tip>
 
-            Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+        Passing `use_auth_token=True` is required when you want to use a private model.
 
-        Examples::
+        </Tip>
 
-            >>> from transformers import AutoProcessor
+        Examples:
 
-            >>> # Download processor from huggingface.co and cache.
-            >>> processor = AutoProcessor.from_pretrained('facebook/wav2vec2-base-960h')
+        ```python
+        >>> from transformers import AutoProcessor
 
-            >>> # If processor files are in a directory (e.g. processor was saved using `save_pretrained('./test/saved_model/')`)
-            >>> processor = AutoProcessor.from_pretrained('./test/saved_model/')
+        >>> # Download processor from huggingface.co and cache.
+        >>> processor = AutoProcessor.from_pretrained('facebook/wav2vec2-base-960h')
 
-        """
+        >>> # If processor files are in a directory (e.g. processor was saved using `save_pretrained('./test/saved_model/')`)
+        >>> processor = AutoProcessor.from_pretrained('./test/saved_model/')
+        ```"""
         config = kwargs.pop("config", None)
         kwargs["_from_auto"] = True
 
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index ca7103f238..6ec092f368 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -273,58 +273,59 @@ def get_tokenizer_config(
     Loads the tokenizer configuration from a pretrained model tokenizer configuration.
 
     Args:
-        pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+        pretrained_model_name_or_path (`str` or `os.PathLike`):
             This can be either:
 
-            - a string, the `model id` of a pretrained model configuration hosted inside a model repo on
-              huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-              namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-            - a path to a `directory` containing a configuration file saved using the
-              :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g., ``./my_model_directory/``.
+            - a string, the *model id* of a pretrained model configuration hosted inside a model repo on
+              huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+              namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+            - a path to a *directory* containing a configuration file saved using the
+              [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`.
 
-        cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+        cache_dir (`str` or `os.PathLike`, *optional*):
             Path to a directory in which a downloaded pretrained model configuration should be cached if the standard
             cache should not be used.
-        force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        force_download (`bool`, *optional*, defaults to `False`):
             Whether or not to force to (re-)download the configuration files and override the cached versions if they
             exist.
-        resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        resume_download (`bool`, *optional*, defaults to `False`):
             Whether or not to delete incompletely received file. Attempts to resume the download if such a file exists.
-        proxies (:obj:`Dict[str, str]`, `optional`):
-            A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-            'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request.
-        use_auth_token (:obj:`str` or `bool`, `optional`):
-            The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
-            generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
-        revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+        proxies (`Dict[str, str]`, *optional*):
+            A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+        use_auth_token (`str` or `bool`, *optional*):
+            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+            generated when running `transformers-cli login` (stored in `~/.huggingface`).
+        revision (`str`, *optional*, defaults to `"main"`):
             The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-            git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
             identifier allowed by git.
-        local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            If :obj:`True`, will only try to load the tokenizer configuration from local files.
+        local_files_only (`bool`, *optional*, defaults to `False`):
+            If `True`, will only try to load the tokenizer configuration from local files.
 
-    .. note::
+    <Tip>
 
-        Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+    Passing `use_auth_token=True` is required when you want to use a private model.
 
+    </Tip>
 
     Returns:
-        :obj:`Dict`: The configuration of the tokenizer.
+        `Dict`: The configuration of the tokenizer.
 
-    Examples::
+    Examples:
 
-        # Download configuration from huggingface.co and cache.
-        tokenizer_config = get_tokenizer_config("bert-base-uncased")
-        # This model does not have a tokenizer config so the result will be an empty dict.
-        tokenizer_config = get_tokenizer_config("xlm-roberta-base")
+    ```python
+    # Download configuration from huggingface.co and cache.
+    tokenizer_config = get_tokenizer_config("bert-base-uncased")
+    # This model does not have a tokenizer config so the result will be an empty dict.
+    tokenizer_config = get_tokenizer_config("xlm-roberta-base")
 
-        # Save a pretrained tokenizer locally and you can reload its config
-        from transformers import AutoTokenizer
+    # Save a pretrained tokenizer locally and you can reload its config
+    from transformers import AutoTokenizer
 
-        tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-        tokenizer.save_pretrained("tokenizer-test")
-        tokenizer_config = get_tokenizer_config("tokenizer-test")
-    """
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+    tokenizer.save_pretrained("tokenizer-test")
+    tokenizer_config = get_tokenizer_config("tokenizer-test")
+    ```"""
     if is_offline_mode() and not local_files_only:
         logger.info("Offline mode: forcing local_files_only=True")
         local_files_only = True
@@ -360,9 +361,9 @@ def get_tokenizer_config(
 class AutoTokenizer:
     r"""
     This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when
-    created with the :meth:`AutoTokenizer.from_pretrained` class method.
+    created with the [`AutoTokenizer.from_pretrained`] class method.
 
-    This class cannot be instantiated directly using ``__init__()`` (throws an error).
+    This class cannot be instantiated directly using `__init__()` (throws an error).
     """
 
     def __init__(self):
@@ -377,75 +378,74 @@ class AutoTokenizer:
         r"""
         Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary.
 
-        The tokenizer class to instantiate is selected based on the :obj:`model_type` property of the config object
-        (either passed as an argument or loaded from :obj:`pretrained_model_name_or_path` if possible), or when it's
-        missing, by falling back to using pattern matching on :obj:`pretrained_model_name_or_path`:
+        The tokenizer class to instantiate is selected based on the `model_type` property of the config object
+        (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's
+        missing, by falling back to using pattern matching on `pretrained_model_name_or_path`:
 
         List options
 
         Params:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 Can be either:
 
-                    - A string, the `model id` of a predefined tokenizer hosted inside a model repo on huggingface.co.
-                      Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under
-                      a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                    - A path to a `directory` containing vocabulary files required by the tokenizer, for instance saved
-                      using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.,
-                      ``./my_model_directory/``.
+                    - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
+                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under
+                      a user or organization name, like `dbmdz/bert-base-german-cased`.
+                    - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
+                      using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g.,
+                      `./my_model_directory/`.
                     - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a
-                      single vocabulary file (like Bert or XLNet), e.g.: ``./my_model_directory/vocab.txt``. (Not
+                      single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not
                       applicable to all derived classes)
-            inputs (additional positional arguments, `optional`):
-                Will be passed along to the Tokenizer ``__init__()`` method.
-            config (:class:`~transformers.PretrainedConfig`, `optional`)
+            inputs (additional positional arguments, *optional*):
+                Will be passed along to the Tokenizer `__init__()` method.
+            config ([`PretrainedConfig`], *optional*)
                 The configuration object used to dertermine the tokenizer class to instantiate.
-            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+            cache_dir (`str` or `os.PathLike`, *optional*):
                 Path to a directory in which a downloaded pretrained model configuration should be cached if the
                 standard cache should not be used.
-            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force the (re-)download the model weights and configuration files and override the
                 cached versions if they exist.
-            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            resume_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to delete incompletely received files. Will attempt to resume the download if such a
                 file exists.
-            proxies (:obj:`Dict[str, str]`, `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            revision (`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                 identifier allowed by git.
-            subfolder (:obj:`str`, `optional`):
+            subfolder (`str`, *optional*):
                 In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                 facebook/rag-token-base), specify it here.
-            use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            use_fast (`bool`, *optional*, defaults to `True`):
                 Whether or not to try to load the fast version of the tokenizer.
-            tokenizer_type (:obj:`str`, `optional`):
+            tokenizer_type (`str`, *optional*):
                 Tokenizer type to be loaded.
-            trust_remote_code (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            trust_remote_code (`bool`, *optional*, defaults to `False`):
                 Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
-                should only be set to :obj:`True` for repositories you trust and in which you have read the code, as it
+                should only be set to `True` for repositories you trust and in which you have read the code, as it
                 will execute code present on the Hub on your local machine.
-            kwargs (additional keyword arguments, `optional`):
-                Will be passed to the Tokenizer ``__init__()`` method. Can be used to set special tokens like
-                ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``,
-                ``mask_token``, ``additional_special_tokens``. See parameters in the ``__init__()`` for more details.
+            kwargs (additional keyword arguments, *optional*):
+                Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like
+                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`,
+                `mask_token`, `additional_special_tokens`. See parameters in the `__init__()` for more details.
 
-        Examples::
+        Examples:
 
-            >>> from transformers import AutoTokenizer
+        ```python
+        >>> from transformers import AutoTokenizer
 
-            >>> # Download vocabulary from huggingface.co and cache.
-            >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+        >>> # Download vocabulary from huggingface.co and cache.
+        >>> tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
 
-            >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
-            >>> tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
+        >>> # Download vocabulary from huggingface.co (user-uploaded) and cache.
+        >>> tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
 
-            >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
-            >>> tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
-
-        """
+        >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/bert_saved_model/')`)
+        >>> tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
+        ```"""
         config = kwargs.pop("config", None)
         kwargs["_from_auto"] = True
 
@@ -568,11 +568,11 @@ class AutoTokenizer:
 
 
         Args:
-            config_class (:class:`~transformers.PretrainedConfig`):
+            config_class ([`PretrainedConfig`]):
                 The configuration corresponding to the model to register.
-            slow_tokenizer_class (:class:`~transformers.PretrainedTokenizer`, `optional`):
+            slow_tokenizer_class ([`PreTrainedTokenizer`], *optional*):
                 The slow tokenizer to register.
+            fast_tokenizer_class ([`PreTrainedTokenizerFast`], *optional*):
+            slow_tokenizer_class ([`PretrainedTokenizerFast`], *optional*):
                 The fast tokenizer to register.
         """
         if slow_tokenizer_class is None and fast_tokenizer_class is None:
diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py
index 86ca38a61d..3e978bba50 100644
--- a/src/transformers/models/bart/configuration_bart.py
+++ b/src/transformers/models/bart/configuration_bart.py
@@ -32,79 +32,79 @@ BART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class BartConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.BartModel`. It is used to
+    This is the configuration class to store the configuration of a [`BartModel`]. It is used to
     instantiate a BART model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the BART `facebook/bart-large
-    <https://huggingface.co/facebook/bart-large>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the BART [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 50265):
             Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.BartModel` or
-            :class:`~transformers.TFBartModel`.
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            `input_ids` passed when calling [`BartModel`] or
+            [`TFBartModel`].
+        d_model (`int`, *optional*, defaults to 1024):
             Dimensionality of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 12):
+        encoder_layers (`int`, *optional*, defaults to 12):
             Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+        decoder_layers (`int`, *optional*, defaults to 12):
             Number of decoder layers.
-        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for classifier.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        encoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
             Scale embeddings by diving by sqrt(d_model).
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
-        num_labels: (:obj:`int`, `optional`, defaults to 3):
-            The number of labels to use in :class:`~transformers.BartForSequenceClassification`.
-        forced_eos_token_id (:obj:`int`, `optional`, defaults to 2):
-            The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
-            :obj:`eos_token_id`.
+        num_labels: (`int`, *optional*, defaults to 3):
+            The number of labels to use in [`BartForSequenceClassification`].
+        forced_eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+            `eos_token_id`.
 
-    Example::
+    Example:
 
-        >>> from transformers import BartModel, BartConfig
+    ```python
+    >>> from transformers import BartModel, BartConfig
 
-        >>> # Initializing a BART facebook/bart-large style configuration
-        >>> configuration = BartConfig()
+    >>> # Initializing a BART facebook/bart-large style configuration
+    >>> configuration = BartConfig()
 
-        >>> # Initializing a model from the facebook/bart-large style configuration
-        >>> model = BartModel(configuration)
+    >>> # Initializing a model from the facebook/bart-large style configuration
+    >>> model = BartModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "bart"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/bart/modeling_flax_bart.py b/src/transformers/models/bart/modeling_flax_bart.py
index b001b5bf58..67a5872e65 100644
--- a/src/transformers/models/bart/modeling_flax_bart.py
+++ b/src/transformers/models/bart/modeling_flax_bart.py
@@ -1016,17 +1016,18 @@ class FlaxBartPreTrainedModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
+        ```python
+        >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
 
-            >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
-            >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+        >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
+        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
-            >>> encoder_outputs = model.encode(**inputs)
-        """
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> encoder_outputs = model.encode(**inputs)
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1081,23 +1082,24 @@ class FlaxBartPreTrainedModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
+        ```python
+        >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
 
-            >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
-            >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+        >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
+        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
-            >>> encoder_outputs = model.encode(**inputs)
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> encoder_outputs = model.encode(**inputs)
 
-            >>> decoder_start_token_id = model.config.decoder_start_token_id
-            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> last_decoder_hidden_states = outputs.last_hidden_state
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> last_decoder_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1348,23 +1350,24 @@ class FlaxBartForConditionalGeneration(FlaxBartPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
+        ```python
+        >>> from transformers import BartTokenizer, FlaxBartForConditionalGeneration
 
-            >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
-            >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+        >>> model = FlaxBartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
+        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
-            >>> encoder_outputs = model.encode(**inputs)
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> encoder_outputs = model.encode(**inputs)
 
-            >>> decoder_start_token_id = model.config.decoder_start_token_id
-            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> logits = outputs.logits
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py
index 5a6b960dbb..ccd189ab1b 100644
--- a/src/transformers/models/bart/tokenization_bart.py
+++ b/src/transformers/models/bart/tokenization_bart.py
@@ -56,8 +56,8 @@ class BartTokenizer(RobertaTokenizer):
     r"""
     Construct a BART tokenizer.
 
-    :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to superclass
-    :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization
+    [`BartTokenizer`] is identical to [`RobertaTokenizer`]. Refer to superclass
+    [`RobertaTokenizer`] for usage examples and documentation concerning the initialization
     parameters and other methods.
     """
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py
index 10ba84e7ab..33bda3efb1 100644
--- a/src/transformers/models/bart/tokenization_bart_fast.py
+++ b/src/transformers/models/bart/tokenization_bart_fast.py
@@ -63,10 +63,10 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class BartTokenizerFast(RobertaTokenizerFast):
     r"""
-    Construct a "fast" BART tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" BART tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.BartTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer to
-    superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the
+    [`BartTokenizerFast`] is identical to [`RobertaTokenizerFast`]. Refer to
+    superclass [`RobertaTokenizerFast`] for usage examples and documentation concerning the
     initialization parameters and other methods.
     """
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py
index 2f39f421bd..ea0ae5897b 100644
--- a/src/transformers/models/barthez/tokenization_barthez.py
+++ b/src/transformers/models/barthez/tokenization_barthez.py
@@ -48,65 +48,70 @@ SPIECE_UNDERLINE = "▁"
 
 class BarthezTokenizer(PreTrainedTokenizer):
     """
-    Adapted from :class:`~transformers.CamembertTokenizer` and :class:`~transformers.BartTokenizer`. Construct a
-    BARThez tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Adapted from [`CamembertTokenizer`] and [`BartTokenizer`]. Construct a
+    BARThez tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
     Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -160,17 +165,17 @@ class BarthezTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A BARThez sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         if token_ids_1 is None:
@@ -184,18 +189,18 @@ class BarthezTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
@@ -213,13 +218,13 @@ class BarthezTokenizer(PreTrainedTokenizer):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
diff --git a/src/transformers/models/barthez/tokenization_barthez_fast.py b/src/transformers/models/barthez/tokenization_barthez_fast.py
index a66f5936a9..f896a331a0 100644
--- a/src/transformers/models/barthez/tokenization_barthez_fast.py
+++ b/src/transformers/models/barthez/tokenization_barthez_fast.py
@@ -58,46 +58,52 @@ SPIECE_UNDERLINE = "▁"
 
 class BarthezTokenizerFast(PreTrainedTokenizerFast):
     """
-    Adapted from :class:`~transformers.CamembertTokenizer` and :class:`~transformers.BartTokenizer`. Construct a "fast"
-    BARThez tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Adapted from [`CamembertTokenizer`] and [`BartTokenizer`]. Construct a "fast"
+    BARThez tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
     """
 
@@ -146,17 +152,17 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A BARThez sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         if token_ids_1 is None:
@@ -172,13 +178,13 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
diff --git a/src/transformers/models/bartpho/tokenization_bartpho.py b/src/transformers/models/bartpho/tokenization_bartpho.py
index adb6af893f..c2e599371c 100644
--- a/src/transformers/models/bartpho/tokenization_bartpho.py
+++ b/src/transformers/models/bartpho/tokenization_bartpho.py
@@ -45,68 +45,72 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"vinai/bartpho-syllable": 1024}
 
 class BartphoTokenizer(PreTrainedTokenizer):
     """
-    Adapted from :class:`~transformers.XLMRobertaTokenizer`. Based on `SentencePiece
-    <https://github.com/google/sentencepiece>`__.
+    Adapted from [`XLMRobertaTokenizer`]. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file. This vocabulary is the pre-trained SentencePiece model available from the
             multilingual XLM-RoBERTa, also used in mBART, consisting of 250K types.
-        monolingual_vocab_file (:obj:`str`):
+        monolingual_vocab_file (`str`):
             Path to the monolingual vocabulary file. This monolingual vocabulary consists of Vietnamese-specialized
             types extracted from the multilingual vocabulary vocab_file of 250K types.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
     Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -183,17 +187,17 @@ class BartphoTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An BARTPho sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         if token_ids_1 is None:
@@ -207,18 +211,18 @@ class BartphoTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -238,13 +242,13 @@ class BartphoTokenizer(PreTrainedTokenizer):
         make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
 
         """
 
diff --git a/src/transformers/models/beit/configuration_beit.py b/src/transformers/models/beit/configuration_beit.py
index 15a0b82b7e..6634fc03b1 100644
--- a/src/transformers/models/beit/configuration_beit.py
+++ b/src/transformers/models/beit/configuration_beit.py
@@ -28,86 +28,87 @@ BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class BeitConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.BeitModel`. It is used to
+    This is the configuration class to store the configuration of a [`BeitModel`]. It is used to
     instantiate an BEiT model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the BEiT
-    `microsoft/beit-base-patch16-224-in22k <https://huggingface.co/microsoft/beit-base-patch16-224-in22k>`__
+    [microsoft/beit-base-patch16-224-in22k](https://huggingface.co/microsoft/beit-base-patch16-224-in22k)
     architecture.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 8092):
+        vocab_size (`int`, *optional*, defaults to 8092):
             Vocabulary size of the BEiT model. Defines the number of different image tokens that can be used during
             pre-training.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        image_size (:obj:`int`, `optional`, defaults to :obj:`224`):
+        image_size (`int`, *optional*, defaults to `224`):
             The size (resolution) of each image.
-        patch_size (:obj:`int`, `optional`, defaults to :obj:`16`):
+        patch_size (`int`, *optional*, defaults to `16`):
             The size (resolution) of each patch.
-        num_channels (:obj:`int`, `optional`, defaults to :obj:`3`):
+        num_channels (`int`, *optional*, defaults to `3`):
             The number of input channels.
-        use_mask_token (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_mask_token (`bool`, *optional*, defaults to `False`):
             Whether to use a mask token for masked image modeling.
-        use_absolute_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_absolute_position_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to use BERT-style absolute position embeddings.
-        use_relative_position_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_relative_position_bias (`bool`, *optional*, defaults to `False`):
             Whether to use T5-style relative position embeddings in the self-attention layers.
-        use_shared_relative_position_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_shared_relative_position_bias (`bool`, *optional*, defaults to `False`):
             Whether to use the same relative position embeddings across all self-attention layers of the Transformer.
-        layer_scale_init_value (:obj:`float`, `optional`, defaults to 0.1):
+        layer_scale_init_value (`float`, *optional*, defaults to 0.1):
             Scale to use in the self-attention layers. 0.1 for base, 1e-5 for large. Set 0 to disable layer scale.
-        drop_path_rate (:obj:`float`, `optional`, defaults to 0.1):
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
             Stochastic depth rate per sample (when applied in the main path of residual layers).
-        use_mean_pooling (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_mean_pooling (`bool`, *optional*, defaults to `True`):
             Whether to mean pool the final hidden states of the patches instead of using the final hidden state of the
             CLS token, before applying the classification head.
-        out_indices (:obj:`List[int]`, `optional`, defaults to :obj:`[3, 5, 7, 11]`):
+        out_indices (`List[int]`, *optional*, defaults to `[3, 5, 7, 11]`):
             Indices of the feature maps to use for semantic segmentation.
-        pool_scales (:obj:`Tuple[int]`, `optional`, defaults to :obj:`[1, 2, 3, 6]`):
+        pool_scales (`Tuple[int]`, *optional*, defaults to `[1, 2, 3, 6]`):
             Pooling scales used in Pooling Pyramid Module applied on the last feature map.
-        use_auxiliary_head (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_auxiliary_head (`bool`, *optional*, defaults to `True`):
             Whether to use an auxiliary head during training.
-        auxiliary_loss_weight (:obj:`float`, `optional`, defaults to 0.4):
+        auxiliary_loss_weight (`float`, *optional*, defaults to 0.4):
             Weight of the cross-entropy loss of the auxiliary head.
-        auxiliary_channels (:obj:`int`, `optional`, defaults to 256):
+        auxiliary_channels (`int`, *optional*, defaults to 256):
             Number of channels to use in the auxiliary head.
-        auxiliary_num_convs (:obj:`int`, `optional`, defaults to 1):
+        auxiliary_num_convs (`int`, *optional*, defaults to 1):
             Number of convolutional layers to use in the auxiliary head.
-        auxiliary_concat_input (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        auxiliary_concat_input (`bool`, *optional*, defaults to `False`):
             Whether to concatenate the output of the auxiliary head with the input before the classification layer.
-        semantic_loss_ignore_index (:obj:`int`, `optional`, defaults to 255):
+        semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
             The index that is ignored by the loss function of the semantic segmentation model.
 
-    Example::
+    Example:
 
-        >>> from transformers import BeitModel, BeitConfig
+    ```python
+    >>> from transformers import BeitModel, BeitConfig
 
-        >>> # Initializing a BEiT beit-base-patch16-224-in22k style configuration
-        >>> configuration = BeitConfig()
+    >>> # Initializing a BEiT beit-base-patch16-224-in22k style configuration
+    >>> configuration = BeitConfig()
 
-        >>> # Initializing a model from the beit-base-patch16-224-in22k style configuration
-        >>> model = BeitModel(configuration)
+    >>> # Initializing a model from the beit-base-patch16-224-in22k style configuration
+    >>> model = BeitModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "beit"
 
     def __init__(
diff --git a/src/transformers/models/beit/feature_extraction_beit.py b/src/transformers/models/beit/feature_extraction_beit.py
index 66067b34ee..997f860115 100644
--- a/src/transformers/models/beit/feature_extraction_beit.py
+++ b/src/transformers/models/beit/feature_extraction_beit.py
@@ -38,34 +38,34 @@ class BeitFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
     r"""
     Constructs a BEiT feature extractor.
 
-    This feature extractor inherits from :class:`~transformers.feature_extraction_utils.FeatureExtractionMixin` which
+    This feature extractor inherits from [`~feature_extraction_utils.FeatureExtractionMixin`] which
     contains most of the main methods. Users should refer to this superclass for more information regarding those
     methods.
 
     Args:
-        do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to resize the input to a certain :obj:`size`.
-        size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 256):
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input to a certain `size`.
+        size (`int` or `Tuple(int)`, *optional*, defaults to 256):
             Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
-            integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize`
-            is set to :obj:`True`.
-        resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BICUBIC`):
-            An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
-            :obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
-            Only has an effect if :obj:`do_resize` is set to :obj:`True`.
-        do_center_crop (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to crop the input at the center. If the input size is smaller than :obj:`crop_size` along any edge,
+            integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
+            is set to `True`.
+        resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`):
+            An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
+            `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
+            Only has an effect if `do_resize` is set to `True`.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge,
             the image is padded with 0's and then center cropped.
-        crop_size (:obj:`int`, `optional`, defaults to 224):
-            Desired output size when applying center-cropping. Only has an effect if :obj:`do_center_crop` is set to
-            :obj:`True`.
-        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to normalize the input with :obj:`image_mean` and :obj:`image_std`.
-        image_mean (:obj:`List[int]`, defaults to :obj:`[0.5, 0.5, 0.5]`):
+        crop_size (`int`, *optional*, defaults to 224):
+            Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to
+            `True`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether or not to normalize the input with `image_mean` and `image_std`.
+        image_mean (`List[float]`, defaults to `[0.5, 0.5, 0.5]`):
             The sequence of means for each channel, to be used when normalizing images.
-        image_std (:obj:`List[int]`, defaults to :obj:`[0.5, 0.5, 0.5]`):
+        image_std (`List[float]`, defaults to `[0.5, 0.5, 0.5]`):
             The sequence of standard deviations for each channel, to be used when normalizing images.
-        reduce_labels (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        reduce_labels (`bool`, *optional*, defaults to `False`):
             Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is
             used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The
             background label will be replaced by 255.
@@ -107,34 +107,36 @@ class BeitFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
         """
         Main method to prepare for the model one or several image(s).
 
-        .. warning::
+        <Tip warning={true}>
 
-           NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
-           PIL images.
+        NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so it is most efficient to pass
+        PIL images.
+
+        </Tip>
 
         Args:
-            images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                 tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                 number of channels, H and W are image height and width.
 
-            segmentation_maps (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`, `optional`):
+            segmentation_maps (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
                 Optionally, the corresponding semantic segmentation maps with the pixel-wise annotations.
 
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
                 If set, will return tensors of a particular framework. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
-                * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
 
         Returns:
-            :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
 
             - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
               width).
-            - **labels** -- Optional labels to be fed to a model (when :obj:`segmentation_maps` are provided)
+            - **labels** -- Optional labels to be fed to a model (when `segmentation_maps` are provided)
         """
         # Input type checking for clearer error
         valid_images = False
diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py
index bd8071ba70..d9a8f47c2f 100755
--- a/src/transformers/models/beit/modeling_beit.py
+++ b/src/transformers/models/beit/modeling_beit.py
@@ -626,22 +626,23 @@ class BeitModel(BeitPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import BeitFeatureExtractor, BeitModel
-            >>> from PIL import Image
-            >>> import requests
+        ```python
+        >>> from transformers import BeitFeatureExtractor, BeitModel
+        >>> from PIL import Image
+        >>> import requests
 
-            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-            >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> image = Image.open(requests.get(url, stream=True).raw)
 
-            >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
-            >>> model = BeitModel.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
+        >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
+        >>> model = BeitModel.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
 
-            >>> inputs = feature_extractor(images=image, return_tensors="pt")
-            >>> outputs = model(**inputs)
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> inputs = feature_extractor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/beit/modeling_flax_beit.py b/src/transformers/models/beit/modeling_flax_beit.py
index 5a1b0c25af..b81f1afb41 100644
--- a/src/transformers/models/beit/modeling_flax_beit.py
+++ b/src/transformers/models/beit/modeling_flax_beit.py
@@ -728,21 +728,23 @@ class FlaxBeitModel(FlaxBeitPreTrainedModel):
 FLAX_BEIT_MODEL_DOCSTRING = """
     Returns:
 
-    Examples::
+    Examples:
 
-        >>> from transformers import BeitFeatureExtractor, FlaxBeitModel
-        >>> from PIL import Image
-        >>> import requests
+    ```python
+    >>> from transformers import BeitFeatureExtractor, FlaxBeitModel
+    >>> from PIL import Image
+    >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+    >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+    >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
-        >>> model = FlaxBeitModel.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
+    >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
+    >>> model = FlaxBeitModel.from_pretrained('microsoft/beit-base-patch16-224-pt22k-ft22k')
 
-        >>> inputs = feature_extractor(images=image, return_tensors="np")
-        >>> outputs = model(**inputs)
-        >>> last_hidden_states = outputs.last_hidden_state
+    >>> inputs = feature_extractor(images=image, return_tensors="np")
+    >>> outputs = model(**inputs)
+    >>> last_hidden_states = outputs.last_hidden_state
+    ```
 """
 
 overwrite_call_docstring(FlaxBeitModel, FLAX_BEIT_MODEL_DOCSTRING)
@@ -897,24 +899,26 @@ class FlaxBeitForImageClassification(FlaxBeitPreTrainedModel):
 FLAX_BEIT_CLASSIF_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> from transformers import BeitFeatureExtractor, FlaxBeitForImageClassification
-        >>> from PIL import Image
-        >>> import requests
+    ```python
+    >>> from transformers import BeitFeatureExtractor, FlaxBeitForImageClassification
+    >>> from PIL import Image
+    >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+    >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+    >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224')
-        >>> model = FlaxBeitForImageClassification.from_pretrained('microsoft/beit-base-patch16-224')
+    >>> feature_extractor = BeitFeatureExtractor.from_pretrained('microsoft/beit-base-patch16-224')
+    >>> model = FlaxBeitForImageClassification.from_pretrained('microsoft/beit-base-patch16-224')
 
-        >>> inputs = feature_extractor(images=image, return_tensors="np")
-        >>> outputs = model(**inputs)
-        >>> logits = outputs.logits
-        >>> # model predicts one of the 1000 ImageNet classes
-        >>> predicted_class_idx = logits.argmax(-1).item()
-        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+    >>> inputs = feature_extractor(images=image, return_tensors="np")
+    >>> outputs = model(**inputs)
+    >>> logits = outputs.logits
+    >>> # model predicts one of the 1000 ImageNet classes
+    >>> predicted_class_idx = logits.argmax(-1).item()
+    >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+    ```
 """
 
 overwrite_call_docstring(FlaxBeitForImageClassification, FLAX_BEIT_CLASSIF_DOCSTRING)
diff --git a/src/transformers/models/bert/configuration_bert.py b/src/transformers/models/bert/configuration_bert.py
index 861cdfbc8e..a3a3ef5ac8 100644
--- a/src/transformers/models/bert/configuration_bert.py
+++ b/src/transformers/models/bert/configuration_bert.py
@@ -53,71 +53,70 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class BertConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.BertModel` or a
-    :class:`~transformers.TFBertModel`. It is used to instantiate a BERT model according to the specified arguments,
+    This is the configuration class to store the configuration of a [`BertModel`] or a
+    [`TFBertModel`]. It is used to instantiate a BERT model according to the specified arguments,
     defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
-    to that of the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
+    to that of the BERT [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or
-            :class:`~transformers.TFBertModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `input_ids` passed calling [`BertModel`] or
+            [`TFBertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
-            :class:`~transformers.TFBertModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or
+            [`TFBertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
-            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
-            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
-            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
-            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
-            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
-            <https://arxiv.org/abs/2009.13658>`__.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
+            `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
+            `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
+            *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if ``config.is_decoder=True``.
-        classifier_dropout (:obj:`float`, `optional`):
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
             The dropout ratio for the classification head.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import BertModel, BertConfig
+    ```python
+    >>> from transformers import BertModel, BertConfig
 
-        >>> # Initializing a BERT bert-base-uncased style configuration
-        >>> configuration = BertConfig()
+    >>> # Initializing a BERT bert-base-uncased style configuration
+    >>> configuration = BertConfig()
 
-        >>> # Initializing a model from the bert-base-uncased style configuration
-        >>> model = BertModel(configuration)
+    >>> # Initializing a model from the bert-base-uncased style configuration
+    >>> model = BertModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "bert"
 
     def __init__(
diff --git a/src/transformers/models/bert/modeling_flax_bert.py b/src/transformers/models/bert/modeling_flax_bert.py
index 6ffcc9d221..369195df7d 100644
--- a/src/transformers/models/bert/modeling_flax_bert.py
+++ b/src/transformers/models/bert/modeling_flax_bert.py
@@ -832,18 +832,20 @@ class FlaxBertForPreTraining(FlaxBertPreTrainedModel):
 FLAX_BERT_FOR_PRETRAINING_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> from transformers import BertTokenizer, FlaxBertForPreTraining
+    ```python
+    >>> from transformers import BertTokenizer, FlaxBertForPreTraining
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = FlaxBertForPreTraining.from_pretrained('bert-base-uncased')
+    >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    >>> model = FlaxBertForPreTraining.from_pretrained('bert-base-uncased')
 
-        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
-        >>> outputs = model(**inputs)
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
+    >>> outputs = model(**inputs)
 
-        >>> prediction_logits = outputs.prediction_logits
-        >>> seq_relationship_logits = outputs.seq_relationship_logits
+    >>> prediction_logits = outputs.prediction_logits
+    >>> seq_relationship_logits = outputs.seq_relationship_logits
+    ```
 """
 
 overwrite_call_docstring(
@@ -976,20 +978,22 @@ class FlaxBertForNextSentencePrediction(FlaxBertPreTrainedModel):
 FLAX_BERT_FOR_NEXT_SENT_PRED_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> from transformers import BertTokenizer, FlaxBertForNextSentencePrediction
+    ```python
+    >>> from transformers import BertTokenizer, FlaxBertForNextSentencePrediction
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = FlaxBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+    >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    >>> model = FlaxBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
 
-        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='jax')
+    >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+    >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+    >>> encoding = tokenizer(prompt, next_sentence, return_tensors='jax')
 
-        >>> outputs = model(**encoding)
-        >>> logits = outputs.logits
-        >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+    >>> outputs = model(**encoding)
+    >>> logits = outputs.logits
+    >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+    ```
 """
 
 
diff --git a/src/transformers/models/bert/modeling_tf_bert.py b/src/transformers/models/bert/modeling_tf_bert.py
index 6c3ceaa5bf..3b3854be71 100644
--- a/src/transformers/models/bert/modeling_tf_bert.py
+++ b/src/transformers/models/bert/modeling_tf_bert.py
@@ -1599,21 +1599,22 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel, TFNextSentencePredi
         r"""
         Return:
 
-        Examples::
+        Examples:
 
-            >>> import tensorflow as tf
-            >>> from transformers import BertTokenizer, TFBertForNextSentencePrediction
+        ```python
+        >>> import tensorflow as tf
+        >>> from transformers import BertTokenizer, TFBertForNextSentencePrediction
 
-            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            >>> model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased')
 
-            >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-            >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-            >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
 
-            >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
-            >>> assert logits[0][0] < logits[0][1] # the next sentence was random
-        """
+        >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
+        >>> assert logits[0][0] < logits[0][1] # the next sentence was random
+        ```"""
         inputs = input_processing(
             func=self.call,
             config=self.config,
diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py
index 897fb32761..5520f845cc 100644
--- a/src/transformers/models/bert/tokenization_bert.py
+++ b/src/transformers/models/bert/tokenization_bert.py
@@ -118,42 +118,41 @@ class BertTokenizer(PreTrainedTokenizer):
     r"""
     Construct a BERT tokenizer. Based on WordPiece.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
             Whether or not to do basic tokenization before WordPiece.
-        never_split (:obj:`Iterable`, `optional`):
+        never_split (`Iterable`, *optional*):
             Collection of tokens which will never be split during tokenization. Only has an effect when
-            :obj:`do_basic_tokenize=True`
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+            `do_basic_tokenize=True`
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
             Whether or not to tokenize Chinese characters.
 
-            This should likely be deactivated for Japanese (see this `issue
-            <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+            This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
+            value for `lowercase` (as in the original BERT).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -252,17 +251,17 @@ class BertTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A BERT sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -275,18 +274,18 @@ class BertTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -305,21 +304,21 @@ class BertTokenizer(PreTrainedTokenizer):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
         pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
@@ -354,19 +353,18 @@ class BasicTokenizer(object):
     Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
 
     Args:
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        never_split (:obj:`Iterable`, `optional`):
+        never_split (`Iterable`, *optional*):
             Collection of tokens which will never be split during tokenization. Only has an effect when
-            :obj:`do_basic_tokenize=True`
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
             Whether or not to tokenize Chinese characters.
 
-            This should likely be deactivated for Japanese (see this `issue
-            <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+            This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
+            value for `lowercase` (as in the original BERT).
     """
 
     def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
@@ -383,9 +381,9 @@ class BasicTokenizer(object):
         WordPieceTokenizer.
 
         Args:
-            **never_split**: (`optional`) list of str
+            never_split (`List[str]`, *optional*):
                 Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                :func:`PreTrainedTokenizer.tokenize`) List of token not to split.
+                [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
         """
         # union() returns a new set by concatenating the two sets.
         never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
@@ -511,14 +509,14 @@ class WordpieceTokenizer(object):
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
-          text: A single token or whitespace separated tokens. This should have
-            already been passed through `BasicTokenizer`.
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
 
         Returns:
-          A list of wordpiece tokens.
+            A list of wordpiece tokens.
         """
 
         output_tokens = []
diff --git a/src/transformers/models/bert/tokenization_bert_fast.py b/src/transformers/models/bert/tokenization_bert_fast.py
index 8004978f60..5b0ebaf086 100644
--- a/src/transformers/models/bert/tokenization_bert_fast.py
+++ b/src/transformers/models/bert/tokenization_bert_fast.py
@@ -116,41 +116,41 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class BertTokenizerFast(PreTrainedTokenizerFast):
     r"""
-    Construct a "fast" BERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on WordPiece.
+    Construct a "fast" BERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        clean_text (`bool`, *optional*, defaults to `True`):
             Whether or not to clean the text before tokenization by removing any control characters and replacing all
             whitespaces by the classic one.
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this
-            issue <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
-        wordpieces_prefix: (:obj:`str`, `optional`, defaults to :obj:`"##"`):
+            value for `lowercase` (as in the original BERT).
+        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
             The prefix for subwords.
     """
 
@@ -205,17 +205,17 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A BERT sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
@@ -231,21 +231,21 @@ class BertTokenizerFast(PreTrainedTokenizerFast):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
         pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/bert_generation/configuration_bert_generation.py b/src/transformers/models/bert_generation/configuration_bert_generation.py
index 2284f873e7..3c79f25fd2 100644
--- a/src/transformers/models/bert_generation/configuration_bert_generation.py
+++ b/src/transformers/models/bert_generation/configuration_bert_generation.py
@@ -20,62 +20,61 @@ from ...configuration_utils import PretrainedConfig
 class BertGenerationConfig(PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a
-    :class:`~transformers.BertGenerationPreTrainedModel`. It is used to instantiate a BertGeneration model according to
+    [`BertGenerationPreTrainedModel`]. It is used to instantiate a BertGeneration model according to
     the specified arguments, defining the model architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50358):
+        vocab_size (`int`, *optional*, defaults to 50358):
             Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.BertGeneration`.
-        hidden_size (:obj:`int`, `optional`, defaults to 1024):
+            `input_ids` passed when calling [`BertGeneration`].
+        hidden_size (`int`, *optional*, defaults to 1024):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 24):
+        num_hidden_layers (`int`, *optional*, defaults to 24):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        num_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (often called feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
-            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
-            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
-            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
-            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
-            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
-            <https://arxiv.org/abs/2009.13658>`__.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
+            `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
+            `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
+            *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if ``config.is_decoder=True``.
+            relevant if `config.is_decoder=True`.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import BertGenerationConfig, BertGenerationEncoder
+    ```python
+    >>> from transformers import BertGenerationConfig, BertGenerationEncoder
 
-        >>> # Initializing a BertGeneration config
-        >>> configuration = BertGenerationConfig()
+    >>> # Initializing a BertGeneration config
+    >>> configuration = BertGenerationConfig()
 
-        >>> # Initializing a model from the config
-        >>> model = BertGenerationEncoder(configuration)
+    >>> # Initializing a model from the config
+    >>> model = BertGenerationEncoder(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "bert-generation"
 
     def __init__(
diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py
index 43676e2801..f6b7a7f9cc 100644
--- a/src/transformers/models/bert_generation/tokenization_bert_generation.py
+++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py
@@ -40,37 +40,36 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"bert_for_seq_generation": 512}
 
 class BertGenerationTokenizer(PreTrainedTokenizer):
     """
-    Construct a BertGeneration tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Construct a BertGeneration tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The begin of sequence token.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
     """
 
diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
index 41c686e41e..0d50dadd00 100644
--- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
+++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
@@ -74,20 +74,20 @@ class BertJapaneseTokenizer(BertTokenizer):
     Construct a BERT tokenizer for Japanese text, based on a MecabTokenizer.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to a one-wordpiece-per-line vocabulary file.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether to lower case the input. Only has an effect when do_basic_tokenize=True.
-        do_word_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_word_tokenize (`bool`, *optional*, defaults to `True`):
             Whether to do word tokenization.
-        do_subword_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_subword_tokenize (`bool`, *optional*, defaults to `True`):
             Whether to do subword tokenization.
-        word_tokenizer_type (:obj:`str`, `optional`, defaults to :obj:`"basic"`):
+        word_tokenizer_type (`str`, *optional*, defaults to `"basic"`):
             Type of word tokenizer.
-        subword_tokenizer_type (:obj:`str`, `optional`, defaults to :obj:`"wordpiece"`):
+        subword_tokenizer_type (`str`, *optional*, defaults to `"wordpiece"`):
             Type of subword tokenizer.
-        mecab_kwargs (:obj:`str`, `optional`):
-            Dictionary passed to the :obj:`MecabTokenizer` constructor.
+        mecab_kwargs (`dict`, *optional*):
+            Dictionary passed to the `MecabTokenizer` constructor.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -210,17 +210,17 @@ class MecabTokenizer:
         Constructs a MecabTokenizer.
 
         Args:
-            **do_lower_case**: (`optional`) boolean (default True)
+            **do_lower_case**: (*optional*) boolean (default True)
                 Whether to lowercase the input.
-            **never_split**: (`optional`) list of str
+            **never_split**: (*optional*) list of str
                 Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                :func:`PreTrainedTokenizer.tokenize`) List of tokens not to split.
-            **normalize_text**: (`optional`) boolean (default True)
+                [`PreTrainedTokenizer.tokenize`]) List of tokens not to split.
+            **normalize_text**: (*optional*) boolean (default True)
                 Whether to apply unicode normalization to text before tokenization.
-            **mecab_dic**: (`optional`) string (default "ipadic")
+            **mecab_dic**: (*optional*) string (default "ipadic")
                 Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
-                set this option to `None` and modify `mecab_option`.
-            **mecab_option**: (`optional`) string
+                set this option to *None* and modify *mecab_option*.
+            **mecab_option**: (*optional*) string
                 String passed to MeCab constructor.
         """
         self.do_lower_case = do_lower_case
@@ -326,11 +326,11 @@ class CharacterTokenizer:
         """
         Tokenizes a piece of text into characters.
 
-        For example, :obj:`input = "apple""` wil return as output :obj:`["a", "p", "p", "l", "e"]`.
+        For example, `input = "apple"` will return as output `["a", "p", "p", "l", "e"]`.
 
         Args:
             text: A single token or whitespace separated tokens.
-                This should have already been passed through `BasicTokenizer`.
+                This should have already been passed through *BasicTokenizer*.
 
         Returns:
             A list of characters.
diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py
index 76103d051c..dfa5e74699 100644
--- a/src/transformers/models/bertweet/tokenization_bertweet.py
+++ b/src/transformers/models/bertweet/tokenization_bertweet.py
@@ -69,43 +69,49 @@ class BertweetTokenizer(PreTrainedTokenizer):
     """
     Constructs a BERTweet tokenizer, using Byte-Pair-Encoding.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Path to the merges file.
-        normalization (:obj:`bool`, `optional`, defaults to :obj:`False`)
+        normalization (`bool`, *optional*, defaults to `False`):
             Whether or not to apply a normalization preprocess.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
     """
@@ -181,17 +187,17 @@ class BertweetTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A BERTweet sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         if token_ids_1 is None:
@@ -205,18 +211,18 @@ class BertweetTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -236,13 +242,13 @@ class BertweetTokenizer(PreTrainedTokenizer):
         not make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
         """
 
         sep = [self.sep_token_id]
@@ -621,12 +627,12 @@ def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8")
 
     Args:
         text:
-            A unicode string or a byte string encoded in the given `encoding` (which defaults to 'utf-8').
+            A unicode string or a byte string encoded in the given *encoding* (which defaults to 'utf-8').
         keep (list):
-            List of entity names which should not be replaced. This supports both numeric entities (``&#nnnn;`` and
-            ``&#hhhh;``) and named entities (such as ``&nbsp;`` or ``&gt;``).
+            List of entity names which should not be replaced. This supports both numeric entities (`&#nnnn;` and
+            `&#hhhh;`) and named entities (such as `&nbsp;` or `&gt;`).
         remove_illegal (bool):
-            If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
+            If *True*, entities that can't be converted are removed. Otherwise, entities that can't be converted are
             kept "as is".
 
     Returns: A unicode string with the entities removed.
@@ -674,21 +680,22 @@ def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8")
 
 class TweetTokenizer:
     r"""
-    Examples::
+    Examples:
 
-        >>> # Tokenizer for tweets.
-        >>> from nltk.tokenize import TweetTokenizer
-        >>> tknzr = TweetTokenizer()
-        >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
-        >>> tknzr.tokenize(s0)
-        ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
+    ```python
+    >>> # Tokenizer for tweets.
+    >>> from nltk.tokenize import TweetTokenizer
+    >>> tknzr = TweetTokenizer()
+    >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
+    >>> tknzr.tokenize(s0)
+    ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--']
 
-        >>> # Examples using `strip_handles` and `reduce_len parameters`:
-        >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
-        >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
-        >>> tknzr.tokenize(s1)
-        [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
-    """
+    >>> # Examples using `strip_handles` and `reduce_len` parameters:
+    >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)
+    >>> s1 = '@remy: This is waaaaayyyy too much for you!!!!!!'
+    >>> tknzr.tokenize(s1)
+    [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!']
+    ```"""
 
     def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False):
         self.preserve_case = preserve_case
diff --git a/src/transformers/models/big_bird/configuration_big_bird.py b/src/transformers/models/big_bird/configuration_big_bird.py
index 85dd8de7dd..80dd708b92 100644
--- a/src/transformers/models/big_bird/configuration_big_bird.py
+++ b/src/transformers/models/big_bird/configuration_big_bird.py
@@ -30,62 +30,65 @@ BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class BigBirdConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.BigBirdModel`. It is used to
+    This is the configuration class to store the configuration of a [`BigBirdModel`]. It is used to
     instantiate an BigBird model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the BigBird
-    `google/bigbird-roberta-base <https://huggingface.co/google/bigbird-roberta-base>`__ architecture.
+    [google/bigbird-roberta-base](https://huggingface.co/google/bigbird-roberta-base) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50358):
+        vocab_size (`int`, *optional*, defaults to 50358):
             Vocabulary size of the BigBird model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.BigBirdModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `input_ids` passed when calling [`BigBirdModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimension of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_new"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 4096):
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 1024 or 2048 or 4096).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BigBirdModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`BigBirdModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if ``config.is_decoder=True``.
-        attention_type (:obj:`str`, `optional`, defaults to :obj:`"block_sparse"`)
+            relevant if `config.is_decoder=True`.
+        attention_type (`str`, *optional*, defaults to `"block_sparse"`):
             Whether to use block sparse attention (with n complexity) as introduced in paper or original attention
-            layer (with n^2 complexity). Possible values are :obj:`"original_full"` and :obj:`"block_sparse"`.
-        use_bias (:obj:`bool`, `optional`, defaults to :obj:`True`)
+            layer (with n^2 complexity). Possible values are `"original_full"` and `"block_sparse"`.
+        use_bias (`bool`, *optional*, defaults to `True`):
             Whether to use bias in query, key, value.
-        rescale_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`)
+        rescale_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to rescale embeddings with (hidden_size ** 0.5).
-        block_size (:obj:`int`, `optional`, defaults to 64)
-            Size of each block. Useful only when :obj:`attention_type == "block_sparse"`.
-        num_random_blocks (:obj:`int`, `optional`, defaults to 3)
-            Each query is going to attend these many number of random blocks. Useful only when :obj:`attention_type ==
-            "block_sparse"`.
-        classifier_dropout (:obj:`float`, `optional`):
+        block_size (`int`, *optional*, defaults to 64):
+            Size of each block. Useful only when `attention_type == "block_sparse"`.
+        num_random_blocks (`int`, *optional*, defaults to 3):
+            Each query is going to attend these many number of random blocks. Useful only when `attention_type == "block_sparse"`.
+        classifier_dropout (`float`, *optional*):
             The dropout ratio for the classification head.
 
-        Example::
+    Example:
+
+    ```python
+
+    ```
 
         >>> from transformers import BigBirdModel, BigBirdConfig
 
diff --git a/src/transformers/models/big_bird/modeling_flax_big_bird.py b/src/transformers/models/big_bird/modeling_flax_big_bird.py
index b1ed49cd36..a1be468934 100644
--- a/src/transformers/models/big_bird/modeling_flax_big_bird.py
+++ b/src/transformers/models/big_bird/modeling_flax_big_bird.py
@@ -1635,18 +1635,20 @@ class FlaxBigBirdForPreTraining(FlaxBigBirdPreTrainedModel):
 FLAX_BIG_BIRD_FOR_PRETRAINING_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> from transformers import BigBirdTokenizer, FlaxBigBirdForPreTraining
+    ```python
+    >>> from transformers import BigBirdTokenizer, FlaxBigBirdForPreTraining
 
-        >>> tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
-        >>> model = FlaxBigBirdForPreTraining.from_pretrained('google/bigbird-roberta-base')
+    >>> tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
+    >>> model = FlaxBigBirdForPreTraining.from_pretrained('google/bigbird-roberta-base')
 
-        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
-        >>> outputs = model(**inputs)
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
+    >>> outputs = model(**inputs)
 
-        >>> prediction_logits = outputs.prediction_logits
-        >>> seq_relationship_logits = outputs.seq_relationship_logits
+    >>> prediction_logits = outputs.prediction_logits
+    >>> seq_relationship_logits = outputs.seq_relationship_logits
+    ```
 """
 
 overwrite_call_docstring(
diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py
index 92f652448d..355e3fd068 100644
--- a/src/transformers/models/big_bird/tokenization_big_bird.py
+++ b/src/transformers/models/big_bird/tokenization_big_bird.py
@@ -46,47 +46,46 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class BigBirdTokenizer(PreTrainedTokenizer):
     """
-    Construct a BigBird tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Construct a BigBird tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The begin of sequence token.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
     """
 
@@ -200,17 +199,17 @@ class BigBirdTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A Big Bird sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -223,18 +222,18 @@ class BigBirdTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
@@ -251,16 +250,16 @@ class BigBirdTokenizer(PreTrainedTokenizer):
         """
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
         pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second
-        sequence | If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        sequence | If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/big_bird/tokenization_big_bird_fast.py b/src/transformers/models/big_bird/tokenization_big_bird_fast.py
index 36f2afa337..26adf8a3ec 100644
--- a/src/transformers/models/big_bird/tokenization_big_bird_fast.py
+++ b/src/transformers/models/big_bird/tokenization_big_bird_fast.py
@@ -58,38 +58,40 @@ SPIECE_UNDERLINE = "▁"
 
 class BigBirdTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" BigBird tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
-    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__. This tokenizer
-    inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should
+    Construct a "fast" BigBird tokenizer (backed by HuggingFace's *tokenizers* library). Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This tokenizer
+    inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
     refer to this superclass for more information regarding those methods
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-               When building a sequence using special tokens, this is not the token that is used for the beginning of
-               sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
-            that is used for the end of sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            that is used for the end of sequence. The token used is the `sep_token`.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
     """
@@ -147,17 +149,17 @@ class BigBirdTokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An BigBird sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -170,18 +172,18 @@ class BigBirdTokenizerFast(PreTrainedTokenizerFast):
     ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Set to True if the token list is already formatted with special tokens for the model
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -203,21 +205,21 @@ class BigBirdTokenizerFast(PreTrainedTokenizerFast):
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
         sequence pair mask has the following format:
 
-        ::
-
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
         if token_ids_1 is None, only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py
index 297e2cede4..2d9fdd18d7 100644
--- a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py
+++ b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py
@@ -30,72 +30,75 @@ BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class BigBirdPegasusConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.BigBirdPegasusModel`. It is
+    This is the configuration class to store the configuration of a [`BigBirdPegasusModel`]. It is
     used to instantiate an BigBirdPegasus model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the BigBirdPegasus
-    `google/bigbird-pegasus-large-arxiv <https://huggingface.co/google/bigbird-pegasus-large-arxiv>`__ architecture.
+    [google/bigbird-pegasus-large-arxiv](https://huggingface.co/google/bigbird-pegasus-large-arxiv) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 96103):
+        vocab_size (`int`, *optional*, defaults to 96103):
             Vocabulary size of the BigBirdPegasus model. Defines the number of different tokens that can be represented
-            by the :obj:`inputs_ids` passed when calling :class:`~transformers.BigBirdPegasusModel`.
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            by the `inputs_ids` passed when calling [`BigBirdPegasusModel`].
+        d_model (`int`, *optional*, defaults to 1024):
             Dimension of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 16):
+        encoder_layers (`int`, *optional*, defaults to 16):
             Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 16):
+        decoder_layers (`int`, *optional*, defaults to 16):
             Number of decoder layers.
-        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimension of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimension of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_new"`):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for classifier.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 4096):
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 1024 or 2048 or 4096).
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the
+            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the
+            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
-        attention_type (:obj:`str`, `optional`, defaults to :obj:`"block_sparse"`)
+        attention_type (`str`, *optional*, defaults to `"block_sparse"`):
             Whether to use block sparse attention (with n complexity) as introduced in paper or original attention
-            layer (with n^2 complexity) in encoder. Possible values are :obj:`"original_full"` and
-            :obj:`"block_sparse"`.
-        use_bias (:obj:`bool`, `optional`, defaults to :obj:`False`)
+            layer (with n^2 complexity) in encoder. Possible values are `"original_full"` and
+            `"block_sparse"`.
+        use_bias (`bool`, *optional*, defaults to `False`):
             Whether to use bias in query, key, value.
-        block_size (:obj:`int`, `optional`, defaults to 64)
-            Size of each block. Useful only when :obj:`attention_type == "block_sparse"`.
-        num_random_blocks (:obj:`int`, `optional`, defaults to 3)
-            Each query is going to attend these many number of random blocks. Useful only when :obj:`attention_type ==
-            "block_sparse"`.
-        scale_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`)
+        block_size (`int`, *optional*, defaults to 64):
+            Size of each block. Useful only when `attention_type == "block_sparse"`.
+        num_random_blocks (`int`, *optional*, defaults to 3):
+            Each query is going to attend these many number of random blocks. Useful only when `attention_type == "block_sparse"`.
+        scale_embeddings (`bool`, *optional*, defaults to `True`):
             Whether to rescale embeddings with (hidden_size ** 0.5).
 
-        Example::
+    Example:
+
+    ```python
+
+    ```
 
         >>> from transformers import BigBirdPegasusModel, BigBirdPegasusConfig
 
diff --git a/src/transformers/models/blenderbot/configuration_blenderbot.py b/src/transformers/models/blenderbot/configuration_blenderbot.py
index 13acbdf699..5dccf86d9c 100644
--- a/src/transformers/models/blenderbot/configuration_blenderbot.py
+++ b/src/transformers/models/blenderbot/configuration_blenderbot.py
@@ -28,77 +28,78 @@ BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class BlenderbotConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.BlenderbotModel`. It is used
+    This is the configuration class to store the configuration of a [`BlenderbotModel`]. It is used
     to instantiate an Blenderbot model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the Blenderbot
-    `facebook/blenderbot-3B <https://huggingface.co/facebook/blenderbot-3B>`__ architecture.
+    [facebook/blenderbot-3B](https://huggingface.co/facebook/blenderbot-3B) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 50265):
             Vocabulary size of the Blenderbot model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.BlenderbotModel` or
-            :class:`~transformers.TFBlenderbotModel`.
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            the `inputs_ids` passed when calling [`BlenderbotModel`] or
+            [`TFBlenderbotModel`].
+        d_model (`int`, *optional*, defaults to 1024):
             Dimensionality of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 12):
+        encoder_layers (`int`, *optional*, defaults to 12):
             Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+        decoder_layers (`int`, *optional*, defaults to 12):
             Number of decoder layers.
-        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for classifier.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 128):
+        max_position_embeddings (`int`, *optional*, defaults to 128):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the
+            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the
+            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
             Scale embeddings by diving by sqrt(d_model).
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models)
-        forced_eos_token_id (:obj:`int`, `optional`, defaults to 2):
-            The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
-            :obj:`eos_token_id`.
+        forced_eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+            `eos_token_id`.
 
-    Example::
+    Example:
 
-        >>> from transformers import BlenderbotModel, BlenderbotConfig
+    ```python
+    >>> from transformers import BlenderbotModel, BlenderbotConfig
 
-        >>> # Initializing a Blenderbot facebook/blenderbot-3B style configuration
-        >>> configuration = BlenderbotConfig()
+    >>> # Initializing a Blenderbot facebook/blenderbot-3B style configuration
+    >>> configuration = BlenderbotConfig()
 
-        >>> # Initializing a model from the facebook/blenderbot-3B style configuration
-        >>> model = BlenderbotModel(configuration)
+    >>> # Initializing a model from the facebook/blenderbot-3B style configuration
+    >>> model = BlenderbotModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "blenderbot"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py
index d92678a76e..fc9597f76e 100755
--- a/src/transformers/models/blenderbot/modeling_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_blenderbot.py
@@ -1128,19 +1128,20 @@ class BlenderbotModel(BlenderbotPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import BlenderbotTokenizer, BlenderbotModel
+        ```python
+        >>> from transformers import BlenderbotTokenizer, BlenderbotModel
 
-            >>> model = BlenderbotModel.from_pretrained("facebook/blenderbot-400M-distill")
-            >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
+        >>> model = BlenderbotModel.from_pretrained("facebook/blenderbot-400M-distill")
+        >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
 
-            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
-            >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
-            >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
 
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
index 068161dbcf..f3dcf35f64 100644
--- a/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_flax_blenderbot.py
@@ -977,17 +977,18 @@ class FlaxBlenderbotPreTrainedModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
+        ```python
+        >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
 
-            >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
-            >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
+        >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
+        >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
-            >>> encoder_outputs = model.encode(**inputs)
-        """
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> encoder_outputs = model.encode(**inputs)
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1044,23 +1045,24 @@ class FlaxBlenderbotPreTrainedModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
+        ```python
+        >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
 
-            >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
-            >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
+        >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
+        >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
-            >>> encoder_outputs = model.encode(**inputs)
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> encoder_outputs = model.encode(**inputs)
 
-            >>> decoder_start_token_id = model.config.decoder_start_token_id
-            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> last_decoder_hidden_states = outputs.last_hidden_state
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> last_decoder_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1312,23 +1314,24 @@ class FlaxBlenderbotForConditionalGeneration(FlaxBlenderbotPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
+        ```python
+        >>> from transformers import BlenderbotTokenizer, FlaxBlenderbotForConditionalGeneration
 
-            >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
-            >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
+        >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained('facebook/blenderbot-400M-distill')
+        >>> tokenizer = BlenderbotTokenizer.from_pretrained('facebook/blenderbot-400M-distill')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
-            >>> encoder_outputs = model.encode(**inputs)
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> encoder_outputs = model.encode(**inputs)
 
-            >>> decoder_start_token_id = model.config.decoder_start_token_id
-            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> logits = outputs.logits
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py
index e003d80534..966b1294db 100644
--- a/src/transformers/models/blenderbot/tokenization_blenderbot.py
+++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py
@@ -47,11 +47,11 @@ class BlenderbotTokenizer(RobertaTokenizer):
     r"""
     Construct a Blenderbot tokenizer.
 
-    :class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs
+    [`BlenderbotTokenizer`] is nearly identical to [`RobertaTokenizer`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add BOS token
     to the beginning of sequences.
 
-    Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`RobertaTokenizer`] for usage examples and documentation concerning
     parameters.
     """
     vocab_files_names = VOCAB_FILES_NAMES
@@ -63,16 +63,16 @@ class BlenderbotTokenizer(RobertaTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A Blenderbot sequence has the following format:
 
-        - single sequence: `` X </s>``
+        - single sequence: ` X </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Will be ignored
 
         Returns:
-            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         return token_ids_0 + [self.eos_token_id]
 
diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
index f7835d573c..f04ce1b369 100644
--- a/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
+++ b/src/transformers/models/blenderbot/tokenization_blenderbot_fast.py
@@ -46,13 +46,13 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128}
 
 class BlenderbotTokenizerFast(RobertaTokenizerFast):
     r"""
-    Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.BlenderbotFast` is nearly identical to :class:`~transformers.RobertaTokenizerFast` and runs
+    [`BlenderbotTokenizerFast`] is nearly identical to [`RobertaTokenizerFast`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add BOS token
     to the beginning of sequences.
 
-    Refer to superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`RobertaTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
     vocab_files_names = VOCAB_FILES_NAMES
@@ -65,16 +65,16 @@ class BlenderbotTokenizerFast(RobertaTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A Blenderbot sequence has the following format:
 
-        - single sequence: `` X </s>``
+        - single sequence: ` X </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Will be ignored
 
         Returns:
-            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         return token_ids_0 + [self.eos_token_id]
 
diff --git a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py
index 0f76e2e3ae..2490cb0207 100644
--- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py
@@ -28,77 +28,78 @@ BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class BlenderbotSmallConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.BlenderbotSmallModel`. It is
+    This is the configuration class to store the configuration of a [`BlenderbotSmallModel`]. It is
     used to instantiate an BlenderbotSmall model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the BlenderbotSmall
-    `facebook/blenderbot_small-90M <https://huggingface.co/facebook/blenderbot_small-90M>`__ architecture.
+    [facebook/blenderbot_small-90M](https://huggingface.co/facebook/blenderbot_small-90M) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 50265):
             Vocabulary size of the BlenderbotSmall model. Defines the number of different tokens that can be
-            represented by the :obj:`inputs_ids` passed when calling :class:`~transformers.BlenderbotSmallModel` or
-            :class:`~transformers.TFBlenderbotSmallModel`.
-        d_model (:obj:`int`, `optional`, defaults to 512):
+            represented by the `input_ids` passed when calling [`BlenderbotSmallModel`] or
+            [`TFBlenderbotSmallModel`].
+        d_model (`int`, *optional*, defaults to 512):
             Dimensionality of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 8):
+        encoder_layers (`int`, *optional*, defaults to 8):
             Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 8):
+        decoder_layers (`int`, *optional*, defaults to 8):
             Number of decoder layers.
-        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
+        decoder_ffn_dim (`int`, *optional*, defaults to 2048):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
+        encoder_ffn_dim (`int`, *optional*, defaults to 2048):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for classifier.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the
+            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the
+            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
             Scale embeddings by diving by sqrt(d_model).
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models)
-        forced_eos_token_id (:obj:`int`, `optional`, defaults to 2):
-            The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
-            :obj:`eos_token_id`.
+        forced_eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+            `eos_token_id`.
 
-    Example::
+    Example:
 
-        >>> from transformers import BlenderbotSmallModel, BlenderbotSmallConfig
+    ```python
+    >>> from transformers import BlenderbotSmallModel, BlenderbotSmallConfig
 
-        >>> # Initializing a BlenderbotSmall facebook/blenderbot_small-90M style configuration
-        >>> configuration = BlenderbotSmallConfig()
+    >>> # Initializing a BlenderbotSmall facebook/blenderbot_small-90M style configuration
+    >>> configuration = BlenderbotSmallConfig()
 
-        >>> # Initializing a model from the facebook/blenderbot_small-90M style configuration
-        >>> model = BlenderbotSmallModel(configuration)
+    >>> # Initializing a model from the facebook/blenderbot_small-90M style configuration
+    >>> model = BlenderbotSmallModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "blenderbot-small"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
index 276ff96a54..e0dcd95aa6 100755
--- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
@@ -1115,19 +1115,20 @@ class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallModel
+        ```python
+        >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallModel
 
-            >>> model = BlenderbotSmallModel.from_pretrained("facebook/blenderbot_small-90M")
-            >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot_small-90M")
+        >>> model = BlenderbotSmallModel.from_pretrained("facebook/blenderbot_small-90M")
+        >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained("facebook/blenderbot_small-90M")
 
-            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
-            >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
-            >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
 
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
index 1daf801b78..fd27a6c094 100644
--- a/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py
@@ -989,17 +989,18 @@ class FlaxBlenderbotSmallPreTrainedModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
+        ```python
+        >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
 
-            >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
-            >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
+        >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
+        >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
-            >>> encoder_outputs = model.encode(**inputs)
-        """
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+        >>> encoder_outputs = model.encode(**inputs)
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1056,23 +1057,24 @@ class FlaxBlenderbotSmallPreTrainedModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
+        ```python
+        >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
 
-            >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
-            >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
+        >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
+        >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
-            >>> encoder_outputs = model.encode(**inputs)
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+        >>> encoder_outputs = model.encode(**inputs)
 
-            >>> decoder_start_token_id = model.config.decoder_start_token_id
-            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> last_decoder_hidden_states = outputs.last_hidden_state
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> last_decoder_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1324,23 +1326,24 @@ class FlaxBlenderbotSmallForConditionalGeneration(FlaxBlenderbotSmallPreTrainedM
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
+        ```python
+        >>> from transformers import BlenderbotSmallTokenizer, FlaxBlenderbotSmallForConditionalGeneration
 
-            >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
-            >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
+        >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/blenderbot_small-90M')
+        >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained('facebook/blenderbot_small-90M')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
-            >>> encoder_outputs = model.encode(**inputs)
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+        >>> encoder_outputs = model.encode(**inputs)
 
-            >>> decoder_start_token_id = model.config.decoder_start_token_id
-            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> logits = outputs.logits
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
index 1b8104e924..29746559be 100644
--- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
@@ -68,25 +68,25 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
     """
     Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair-Encoding)
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to the superclass for more information regarding methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Path to the merges file.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"__start__"`):
+        bos_token (`str`, *optional*, defaults to `"__start__"`):
             The beginning of sentence token.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"__end__"`):
+        eos_token (`str`, *optional*, defaults to `"__end__"`):
             The end of sentence token.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"__unk__"`):
+        unk_token (`str`, *optional*, defaults to `"__unk__"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"__pad__"`):
+        pad_token (`str`, *optional*, defaults to `"__pad__"`):
             The token used for padding, for example when batching sequences of different lengths.
         **kwargs
-            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
+            Additional keyword arguments passed along to [`PreTrainedTokenizer`]
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py
index 2867b598b7..63c8c39563 100644
--- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py
+++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py
@@ -49,10 +49,10 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" BlenderbotSmall tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" BlenderbotSmall tokenizer (backed by HuggingFace's *tokenizers* library).
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
     """
 
@@ -101,13 +101,13 @@ class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
         does not make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
diff --git a/src/transformers/models/byt5/tokenization_byt5.py b/src/transformers/models/byt5/tokenization_byt5.py
index 4714dbd27e..7c0e94f35c 100644
--- a/src/transformers/models/byt5/tokenization_byt5.py
+++ b/src/transformers/models/byt5/tokenization_byt5.py
@@ -29,29 +29,31 @@ class ByT5Tokenizer(PreTrainedTokenizer):
     """
     Construct a ByT5 tokenizer. ByT5 simply uses raw bytes utf-8 encoding.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        extra_ids (:obj:`int`, `optional`, defaults to 100):
+        extra_ids (`int`, *optional*, defaults to 100):
             Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
             accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
             indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
-            like in ByT5 preprocessing see `here
-            <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
-        additional_special_tokens (:obj:`List[str]`, `optional`):
+            like in ByT5 preprocessing see [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
+        additional_special_tokens (`List[str]`, *optional*):
             Additional special tokens used by the tokenizer.
     """
 
@@ -116,18 +118,18 @@ class ByT5Tokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
@@ -157,13 +159,13 @@ class ByT5Tokenizer(PreTrainedTokenizer):
         make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
         """
         eos = [self.eos_token_id]
 
@@ -178,17 +180,17 @@ class ByT5Tokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A sequence has the following format:
 
-        - single sequence: ``X </s>``
-        - pair of sequences: ``A </s> B </s>``
+        - single sequence: `X </s>`
+        - pair of sequences: `A </s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         token_ids_0 = self._add_eos_if_not_present(token_ids_0)
         if token_ids_1 is None:
diff --git a/src/transformers/models/camembert/configuration_camembert.py b/src/transformers/models/camembert/configuration_camembert.py
index 8a55e1c320..2479ca98a6 100644
--- a/src/transformers/models/camembert/configuration_camembert.py
+++ b/src/transformers/models/camembert/configuration_camembert.py
@@ -34,7 +34,7 @@ CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class CamembertConfig(RobertaConfig):
     """
-    This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate
+    This class overrides [`RobertaConfig`]. Please check the superclass for the appropriate
     documentation alongside usage examples.
     """
 
diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py
index c367e31255..2bb76236ee 100644
--- a/src/transformers/models/camembert/tokenization_camembert.py
+++ b/src/transformers/models/camembert/tokenization_camembert.py
@@ -44,65 +44,70 @@ SPIECE_UNDERLINE = "▁"
 
 class CamembertTokenizer(PreTrainedTokenizer):
     """
-    Adapted from :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Construct a
-    CamemBERT tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Construct a
+    CamemBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
     Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -158,17 +163,17 @@ class CamembertTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An CamemBERT sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         if token_ids_1 is None:
@@ -182,18 +187,18 @@ class CamembertTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
@@ -212,13 +217,13 @@ class CamembertTokenizer(PreTrainedTokenizer):
         RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py
index cce7e2f63c..782ba2f5c3 100644
--- a/src/transformers/models/camembert/tokenization_camembert_fast.py
+++ b/src/transformers/models/camembert/tokenization_camembert_fast.py
@@ -53,47 +53,52 @@ SPIECE_UNDERLINE = "▁"
 
 class CamembertTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
-    :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE
-    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
+    Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+    [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
     """
 
@@ -144,17 +149,17 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An CamemBERT sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         if token_ids_1 is None:
@@ -171,13 +176,13 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
         RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
diff --git a/src/transformers/models/canine/configuration_canine.py b/src/transformers/models/canine/configuration_canine.py
index 79be54a824..b57a4fafff 100644
--- a/src/transformers/models/canine/configuration_canine.py
+++ b/src/transformers/models/canine/configuration_canine.py
@@ -28,66 +28,66 @@ CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class CanineConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.CanineModel`. It is used to
+    This is the configuration class to store the configuration of a [`CanineModel`]. It is used to
     instantiate an CANINE model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the CANINE `google/canine-s
-    <https://huggingface.co/google/canine-s>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the CANINE [google/canine-s](https://huggingface.co/google/canine-s) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimension of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the deep Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoders.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoders.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoders, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 16384):
+        max_position_embeddings (`int`, *optional*, defaults to 16384):
             The maximum sequence length that this model might ever be used with.
-        type_vocab_size (:obj:`int`, `optional`, defaults to 16):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.CanineModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 16):
+            The vocabulary size of the `token_type_ids` passed when calling [`CanineModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        downsampling_rate (:obj:`int`, `optional`, defaults to 4):
+        downsampling_rate (`int`, *optional*, defaults to 4):
             The rate at which to downsample the original character sequence length before applying the deep Transformer
             encoder.
-        upsampling_kernel_size (:obj:`int`, `optional`, defaults to 4):
+        upsampling_kernel_size (`int`, *optional*, defaults to 4):
             The kernel size (i.e. the number of characters in each window) of the convolutional projection layer when
-            projecting back from :obj:`hidden_size`*2 to :obj:`hidden_size`.
-        num_hash_functions (:obj:`int`, `optional`, defaults to 8):
+            projecting back from `hidden_size * 2` to `hidden_size`.
+        num_hash_functions (`int`, *optional*, defaults to 8):
             The number of hash functions to use. Each hash function has its own embedding matrix.
-        num_hash_buckets (:obj:`int`, `optional`, defaults to 16384):
+        num_hash_buckets (`int`, *optional*, defaults to 16384):
             The number of hash buckets to use.
-        local_transformer_stride (:obj:`int`, `optional`, defaults to 128):
+        local_transformer_stride (`int`, *optional*, defaults to 128):
             The stride of the local attention of the first shallow Transformer encoder. Defaults to 128 for good
             TPU/XLA memory alignment.
 
-    Example::
+    Example:
 
-        >>> from transformers import CanineModel, CanineConfig
+    ```python
+    >>> from transformers import CanineModel, CanineConfig
 
-        >>> # Initializing a CANINE google/canine-s style configuration
-        >>> configuration = CanineConfig()
+    >>> # Initializing a CANINE google/canine-s style configuration
+    >>> configuration = CanineConfig()
 
-        >>> # Initializing a model from the google/canine-s style configuration
-        >>> model = CanineModel(configuration)
+    >>> # Initializing a model from the google/canine-s style configuration
+    >>> model = CanineModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "canine"
 
     def __init__(
diff --git a/src/transformers/models/canine/tokenization_canine.py b/src/transformers/models/canine/tokenization_canine.py
index 87580629c8..4bf96f1828 100644
--- a/src/transformers/models/canine/tokenization_canine.py
+++ b/src/transformers/models/canine/tokenization_canine.py
@@ -65,13 +65,13 @@ class CanineTokenizer(PreTrainedTokenizer):
     Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
     converts each character into its Unicode code point.
 
-    :class:`~transformers.CanineTokenizer` inherits from :class:`~transformers.PreTrainedTokenizer`.
+    [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].
 
-    Refer to superclass :class:`~transformers.PreTrainedTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning
     parameters.
 
     Args:
-        model_max_length (:obj:`int`, `optional`, defaults to 2048):
+        model_max_length (`int`, *optional*, defaults to 2048):
                 The maximum sentence length the model accepts.
     """
 
@@ -160,17 +160,17 @@ class CanineTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A CANINE sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -185,18 +185,18 @@ class CanineTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
@@ -215,21 +215,21 @@ class CanineTokenizer(PreTrainedTokenizer):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A CANINE
         sequence pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py
index 0f8b6fa9a4..e0a34e722d 100644
--- a/src/transformers/models/clip/configuration_clip.py
+++ b/src/transformers/models/clip/configuration_clip.py
@@ -30,58 +30,58 @@ CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class CLIPTextConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.CLIPModel`. It is used to
+    This is the configuration class to store the configuration of a [`CLIPModel`]. It is used to
     instantiate an CLIP model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the CLIP
-    `openai/clip-vit-base-patch32 <https://huggingface.co/openai/clip-vit-base-patch32>`__ architecture.
+    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 49408):
+        vocab_size (`int`, *optional*, defaults to 49408):
             Vocabulary size of the CLIP text model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.CLIPModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 512):
+            the `inputs_ids` passed when calling [`CLIPModel`].
+        hidden_size (`int`, *optional*, defaults to 512):
             Dimensionality of the encoder layers and the pooler layer.
-        intermediate_size (:obj:`int`, `optional`, defaults to 2048):
+        intermediate_size (`int`, *optional*, defaults to 2048):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 8):
+        num_attention_heads (`int`, *optional*, defaults to 8):
             Number of attention heads for each attention layer in the Transformer encoder.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 77):
+        max_position_embeddings (`int`, *optional*, defaults to 77):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"quick_gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` :obj:`"quick_gelu"` are supported.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-5):
+            `"gelu"`, `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults to 1e-5):
             The epsilon used by the layer normalization layers.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        dropout (:obj:`float`, `optional`, defaults to 0.0):
+        dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        initializer_factor (:obj:`float`, `optional`, defaults to 1):
+        initializer_factor (`float`, *optional*, defaults to 1):
             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
             testing).
 
-    Example::
+    Example:
 
-        >>> from transformers import CLIPTextModel, CLIPTextConfig
+    ```python
+    >>> from transformers import CLIPTextModel, CLIPTextConfig
 
-        >>> # Initializing a CLIPTextModel with openai/clip-vit-base-patch32 style configuration
-        >>> configuration = CLIPTextConfig()
+    >>> # Initializing a CLIPTextModel with openai/clip-vit-base-patch32 style configuration
+    >>> configuration = CLIPTextConfig()
 
-        >>> # Initializing a CLIPTextConfig from the openai/clip-vit-base-patch32 style configuration
-        >>> model = CLIPTextModel(configuration)
+    >>> # Initializing a CLIPTextConfig from the openai/clip-vit-base-patch32 style configuration
+    >>> model = CLIPTextModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "clip_text_model"
 
     def __init__(
@@ -121,56 +121,56 @@ class CLIPTextConfig(PretrainedConfig):
 
 class CLIPVisionConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.CLIPModel`. It is used to
+    This is the configuration class to store the configuration of a [`CLIPModel`]. It is used to
     instantiate an CLIP model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the CLIP
-    `openai/clip-vit-base-patch32 <https://huggingface.co/openai/clip-vit-base-patch32>`__ architecture.
+    [openai/clip-vit-base-patch32](https://huggingface.co/openai/clip-vit-base-patch32) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        image_size (:obj:`int`, `optional`, defaults to 224):
+        image_size (`int`, *optional*, defaults to 224):
             The size (resolution) of each image.
-        patch_size (:obj:`int`, `optional`, defaults to 32):
+        patch_size (`int`, *optional*, defaults to 32):
             The size (resolution) of each patch.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"quick_gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` :obj:`"quick_gelu"` are supported.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-5):
+            `"gelu"`, `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults to 1e-5):
             The epsilon used by the layer normalization layers.
-        dropout (:obj:`float`, `optional`, defaults to 0.0):
+        dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        initializer_factor (:obj:`float`, `optional`, defaults to 1):
+        initializer_factor (`float`, *optional*, defaults to 1):
             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
             testing).
 
-    Example::
+    Example:
 
-        >>> from transformers import CLIPVisionModel, CLIPVisionConfig
+    ```python
+    >>> from transformers import CLIPVisionModel, CLIPVisionConfig
 
-        >>> # Initializing a CLIPVisionModel with openai/clip-vit-base-patch32 style configuration
-        >>> configuration = CLIPVisionConfig()
+    >>> # Initializing a CLIPVisionConfig with openai/clip-vit-base-patch32 style configuration
+    >>> configuration = CLIPVisionConfig()
 
-        >>> # Initializing a CLIPVisionModel model from the openai/clip-vit-base-patch32 style configuration
-        >>> model = CLIPVisionModel(configuration)
+    >>> # Initializing a CLIPVisionModel from the openai/clip-vit-base-patch32 style configuration
+    >>> model = CLIPVisionModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "clip_vision_model"
 
@@ -208,23 +208,23 @@ class CLIPVisionConfig(PretrainedConfig):
 
 class CLIPConfig(PretrainedConfig):
     r"""
-    :class:`~transformers.CLIPConfig` is the configuration class to store the configuration of a
-    :class:`~transformers.CLIPModel`. It is used to instantiate CLIP model according to the specified arguments,
+    [`CLIPConfig`] is the configuration class to store the configuration of a
+    [`CLIPModel`]. It is used to instantiate a CLIP model according to the specified arguments,
     defining the text model and vision model configs.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        text_config_dict (:obj:`dict`, `optional`):
-            Dictionary of configuration options used to initialize :class:`~transformers.CLIPTextConfig`.
-        vision_config_dict (:obj:`dict`, `optional`):
-            Dictionary of configuration options used to initialize :class:`~transformers.CLIPVisionConfig`.
-        projection_dim (:obj:`int`, `optional`, defaults to 512):
+        text_config_dict (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`CLIPTextConfig`].
+        vision_config_dict (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
+        projection_dim (`int`, *optional*, defaults to 512):
             Dimentionality of text and vision projection layers.
-        logit_scale_init_value (:obj:`float`, `optional`, defaults to 2.6592):
-            The inital value of the `logit_scale` paramter. Default is used as per the original CLIP implementation.
-        kwargs (`optional`):
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the `logit_scale` parameter. Default is used as per the original CLIP implementation.
+        kwargs (*optional*):
             Dictionary of keyword arguments.
     """
 
@@ -259,11 +259,11 @@ class CLIPConfig(PretrainedConfig):
     @classmethod
     def from_text_vision_configs(cls, text_config: CLIPTextConfig, vision_config: CLIPVisionConfig, **kwargs):
         r"""
-        Instantiate a :class:`~transformers.CLIPConfig` (or a derived class) from clip text model configuration and
+        Instantiate a [`CLIPConfig`] (or a derived class) from clip text model configuration and
         clip vision model configuration.
 
         Returns:
-            :class:`CLIPConfig`: An instance of a configuration object
+            [`CLIPConfig`]: An instance of a configuration object
         """
 
         return cls(text_config_dict=text_config.to_dict(), vision_config_dict=vision_config.to_dict(), **kwargs)
@@ -271,10 +271,10 @@ class CLIPConfig(PretrainedConfig):
     def to_dict(self):
         """
         Serializes this instance to a Python dictionary. Override the default
-        :meth:`~transformers.PretrainedConfig.to_dict`.
+        [`~PretrainedConfig.to_dict`].
 
         Returns:
-            :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
         """
         output = copy.deepcopy(self.__dict__)
         output["text_config"] = self.text_config.to_dict()
diff --git a/src/transformers/models/clip/feature_extraction_clip.py b/src/transformers/models/clip/feature_extraction_clip.py
index b6256b4686..45c5289c90 100644
--- a/src/transformers/models/clip/feature_extraction_clip.py
+++ b/src/transformers/models/clip/feature_extraction_clip.py
@@ -32,29 +32,29 @@ class CLIPFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
     r"""
     Constructs a CLIP feature extractor.
 
-    This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main
+    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to resize the input to a certain :obj:`size`.
-        size (:obj:`int`, `optional`, defaults to 224):
-            Resize the input to the given size. Only has an effect if :obj:`do_resize` is set to :obj:`True`.
-        resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BICUBIC`):
-            An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
-            :obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
-            Only has an effect if :obj:`do_resize` is set to :obj:`True`.
-        do_center_crop (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to crop the input at the center. If the input size is smaller than :obj:`crop_size` along any edge,
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input to a certain `size`.
+        size (`int`, *optional*, defaults to 224):
+            Resize the input to the given size. Only has an effect if `do_resize` is set to `True`.
+        resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`):
+            An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
+            `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
+            Only has an effect if `do_resize` is set to `True`.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge,
             the image is padded with 0's and then center cropped.
-        crop_size (:obj:`int`, `optional`, defaults to 224):
-            Desired output size when applying center-cropping. Only has an effect if :obj:`do_center_crop` is set to
-            :obj:`True`.
-        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to normalize the input with :obj:`image_mean` and :obj:`image_std`.
-        image_mean (:obj:`List[int]`, defaults to :obj:`[0.485, 0.456, 0.406]`):
+        crop_size (`int`, *optional*, defaults to 224):
+            Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to
+            `True`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether or not to normalize the input with `image_mean` and `image_std`.
+        image_mean (`List[float]`, defaults to `[0.485, 0.456, 0.406]`):
             The sequence of means for each channel, to be used when normalizing images.
-        image_std (:obj:`List[int]`, defaults to :obj:`[0.229, 0.224, 0.225]`):
+        image_std (`List[float]`, defaults to `[0.229, 0.224, 0.225]`):
             The sequence of standard deviations for each channel, to be used when normalizing images.
     """
 
@@ -93,27 +93,29 @@ class CLIPFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
         """
         Main method to prepare for the model one or several image(s).
 
-        .. warning::
+        <Tip warning={true}>
 
-           NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
-           PIL images.
+        NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
+        PIL images.
+
+        </Tip>
 
         Args:
-            images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                 tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                 number of channels, H and W are image height and width.
 
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
                 If set, will return tensors of a particular framework. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
-                * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
 
         Returns:
-            :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
 
             - **pixel_values** -- Pixel values to be fed to a model.
         """
@@ -157,13 +159,13 @@ class CLIPFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
 
     def center_crop(self, image, size):
         """
-        Crops :obj:`image` to the given size using a center crop. Note that if the image is too small to be cropped to
+        Crops `image` to the given size using a center crop. Note that if the image is too small to be cropped to
         the size is given, it will be padded (so the returned result has the size asked).
 
         Args:
-            image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                 The image to resize.
-            size (:obj:`int` or :obj:`Tuple[int, int]`):
+            size (`int` or `Tuple[int, int]`):
                 The size to which crop the image.
         """
         self._ensure_format_supported(image)
@@ -183,14 +185,14 @@ class CLIPFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
 
     def resize(self, image, size, resample=Image.BICUBIC):
         """
-        Resizes :obj:`image`. Note that this will trigger a conversion of :obj:`image` to a PIL Image.
+        Resizes `image`. Note that this will trigger a conversion of `image` to a PIL Image.
 
         Args:
-            image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                 The image to resize.
-            size (:obj:`int` or :obj:`Tuple[int, int]`):
-                The size to use for resizing the image. If :obj:`int` it will be resized to match the shorter side
-            resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BILINEAR`):
+            size (`int` or `Tuple[int, int]`):
+                The size to use for resizing the image. If `int` it will be resized to match the shorter side
+            resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`):
                 The filter to user for resampling.
         """
         self._ensure_format_supported(image)
diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index d61ce2553c..45e7c0b307 100755
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -704,19 +704,20 @@ class CLIPTextModel(CLIPPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import CLIPTokenizer, CLIPTextModel
+        ```python
+        >>> from transformers import CLIPTokenizer, CLIPTextModel
 
-            >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
-            >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+        >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
 
-            >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"],  padding=True, return_tensors="pt")
+        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"],  padding=True, return_tensors="pt")
 
-            >>> outputs = model(**inputs)
-            >>> last_hidden_state = outputs.last_hidden_state
-            >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
-        """
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
+        ```"""
         return self.text_model(
             input_ids=input_ids,
             attention_mask=attention_mask,
@@ -810,24 +811,25 @@ class CLIPVisionModel(CLIPPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from PIL import Image
-            >>> import requests
-            >>> from transformers import CLIPProcessor, CLIPVisionModel
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import CLIPProcessor, CLIPVisionModel
 
-            >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
-            >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
-            >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-            >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
 
-            >>> inputs = processor(images=image, return_tensors="pt")
+        >>> inputs = processor(images=image, return_tensors="pt")
 
-            >>> outputs = model(**inputs)
-            >>> last_hidden_state = outputs.last_hidden_state
-            >>> pooled_output = outputs.pooler_output # pooled CLS states
-        """
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        >>> pooled_output = outputs.pooler_output # pooled CLS states
+        ```"""
         return self.vision_model(
             pixel_values=pixel_values,
             output_attentions=output_attentions,
@@ -968,25 +970,25 @@ class CLIPModel(CLIPPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from PIL import Image
-            >>> import requests
-            >>> from transformers import CLIPProcessor, CLIPModel
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import CLIPProcessor, CLIPModel
 
-            >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
-            >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
-            >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-            >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
 
-            >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
+        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
 
-            >>> outputs = model(**inputs)
-            >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
-            >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
-
-        """
+        >>> outputs = model(**inputs)
+        >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+        >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+        ```"""
         return_dict = return_dict if return_dict is not None else self.config.return_dict
         vision_outputs = self.vision_model(
             pixel_values=pixel_values,
diff --git a/src/transformers/models/clip/modeling_flax_clip.py b/src/transformers/models/clip/modeling_flax_clip.py
index 13530e39d3..97db507467 100644
--- a/src/transformers/models/clip/modeling_flax_clip.py
+++ b/src/transformers/models/clip/modeling_flax_clip.py
@@ -940,18 +940,20 @@ class FlaxCLIPTextModel(FlaxCLIPTextPreTrainedModel):
 FLAX_CLIP_TEXT_MODEL_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> from transformers import CLIPTokenizer, FlaxCLIPTextModel
+    ```python
+    >>> from transformers import CLIPTokenizer, FlaxCLIPTextModel
 
-        >>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
-        >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
+    >>> model = FlaxCLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
+    >>> tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-base-patch32")
 
-        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"],  padding=True, return_tensors="np")
+    >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"],  padding=True, return_tensors="np")
 
-        >>> outputs = model(**inputs)
-        >>> last_hidden_state = outputs.last_hidden_state
-        >>> pooler_output = outputs.pooler_output # pooled (EOS token) states
+    >>> outputs = model(**inputs)
+    >>> last_hidden_state = outputs.last_hidden_state
+    >>> pooler_output = outputs.pooler_output # pooled (EOS token) states
+    ```
 """
 
 overwrite_call_docstring(FlaxCLIPTextModel, CLIP_TEXT_INPUTS_DOCSTRING + FLAX_CLIP_TEXT_MODEL_DOCSTRING)
@@ -991,23 +993,25 @@ class FlaxCLIPVisionModel(FlaxCLIPVisionPreTrainedModel):
 FLAX_CLIP_VISION_MODEL_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import CLIPProcessor, FlaxCLIPVisionModel
+    ```python
+    >>> from PIL import Image
+    >>> import requests
+    >>> from transformers import CLIPProcessor, FlaxCLIPVisionModel
 
-        >>> model = FlaxCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
-        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+    >>> model = FlaxCLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
+    >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> inputs = processor(images=image, return_tensors="np")
+    >>> inputs = processor(images=image, return_tensors="np")
 
-        >>> outputs = model(**inputs)
-        >>> last_hidden_state = outputs.last_hidden_state
-        >>> pooler_output = outputs.pooler_output # pooled CLS states
+    >>> outputs = model(**inputs)
+    >>> last_hidden_state = outputs.last_hidden_state
+    >>> pooler_output = outputs.pooler_output # pooled CLS states
+    ```
 """
 
 overwrite_call_docstring(FlaxCLIPVisionModel, CLIP_VISION_INPUTS_DOCSTRING + FLAX_CLIP_VISION_MODEL_DOCSTRING)
@@ -1115,24 +1119,26 @@ class FlaxCLIPModel(FlaxCLIPPreTrainedModel):
 FLAX_CLIP_MODEL_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> import jax
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import CLIPProcessor, FlaxCLIPModel
+    ```python
+    >>> import jax
+    >>> from PIL import Image
+    >>> import requests
+    >>> from transformers import CLIPProcessor, FlaxCLIPModel
 
-        >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
-        >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+    >>> model = FlaxCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+    >>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
 
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="np", padding=True)
+    >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="np", padding=True)
 
-        >>> outputs = model(**inputs)
-        >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
-        >>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
+    >>> outputs = model(**inputs)
+    >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+    >>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
+    ```
 """
 
 overwrite_call_docstring(FlaxCLIPModel, CLIP_INPUTS_DOCSTRING + FLAX_CLIP_MODEL_DOCSTRING)
diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py
index e75199f2b2..caae7983c1 100644
--- a/src/transformers/models/clip/processing_clip.py
+++ b/src/transformers/models/clip/processing_clip.py
@@ -24,14 +24,14 @@ class CLIPProcessor:
     r"""
     Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor.
 
-    :class:`~transformers.CLIPProcessor` offers all the functionalities of :class:`~transformers.CLIPFeatureExtractor`
-    and :class:`~transformers.CLIPTokenizer`. See the :meth:`~transformers.CLIPProcessor.__call__` and
-    :meth:`~transformers.CLIPProcessor.decode` for more information.
+    [`CLIPProcessor`] offers all the functionalities of [`CLIPFeatureExtractor`]
+    and [`CLIPTokenizer`]. See the [`~CLIPProcessor.__call__`] and
+    [`~CLIPProcessor.decode`] for more information.
 
     Args:
-        feature_extractor (:class:`~transformers.CLIPFeatureExtractor`):
+        feature_extractor ([`CLIPFeatureExtractor`]):
             The feature extractor is a required input.
-        tokenizer (:class:`~transformers.CLIPTokenizer`):
+        tokenizer ([`CLIPTokenizer`]):
             The tokenizer is a required input.
     """
 
@@ -49,17 +49,19 @@ class CLIPProcessor:
 
     def save_pretrained(self, save_directory):
         """
-        Save a CLIP feature extractor object and CLIP tokenizer object to the directory ``save_directory``, so that it
-        can be re-loaded using the :func:`~transformers.CLIPProcessor.from_pretrained` class method.
+        Save a CLIP feature extractor object and CLIP tokenizer object to the directory `save_directory`, so that it
+        can be re-loaded using the [`~CLIPProcessor.from_pretrained`] class method.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` and
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
-            docstrings of the methods above for more information.
+        This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
+        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
+        docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            save_directory (:obj:`str` or :obj:`os.PathLike`):
+            save_directory (`str` or `os.PathLike`):
                 Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                 be created if it does not exist).
         """
@@ -70,31 +72,33 @@ class CLIPProcessor:
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         r"""
-        Instantiate a :class:`~transformers.CLIPProcessor` from a pretrained CLIP processor.
+        Instantiate a [`CLIPProcessor`] from a pretrained CLIP processor.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling CLIPFeatureExtractor's
-            :meth:`~transformers.PreTrainedFeatureExtractor.from_pretrained` and CLIPTokenizer's
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
-            docstrings of the methods above for more information.
+        This class method is simply calling CLIPFeatureExtractor's
+        [`~PreTrainedFeatureExtractor.from_pretrained`] and CLIPTokenizer's
+        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
+        docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 This can be either:
 
-                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like ``clip-vit-base-patch32``, or
-                  namespaced under a user or organization name, like ``openai/clip-vit-base-patch32``.
-                - a path to a `directory` containing a feature extractor file saved using the
-                  :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` method, e.g.,
-                  ``./my_model_directory/``.
-                - a path or url to a saved feature extractor JSON `file`, e.g.,
-                  ``./my_model_directory/preprocessor_config.json``.
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `clip-vit-base-patch32`, or
+                  namespaced under a user or organization name, like `openai/clip-vit-base-patch32`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g.,
+                  `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
 
             **kwargs
-                Additional keyword arguments passed along to both :class:`~transformers.PreTrainedFeatureExtractor` and
-                :class:`~transformers.PreTrainedTokenizer`
+                Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
+                [`PreTrainedTokenizer`]
         """
         feature_extractor = CLIPFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
         tokenizer = CLIPTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
@@ -104,38 +108,38 @@ class CLIPProcessor:
     def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
         """
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the
-        :obj:`text` and :obj:`kwargs` arguments to CLIPTokenizer's :meth:`~transformers.CLIPTokenizer.__call__` if
-        :obj:`text` is not :obj:`None` to encode the text. To prepare the image(s), this method forwards the
-        :obj:`images` and :obj:`kwrags` arguments to CLIPFeatureExtractor's
-        :meth:`~transformers.CLIPFeatureExtractor.__call__` if :obj:`images` is not :obj:`None`. Please refer to the
+        `text` and `kwargs` arguments to CLIPTokenizer's [`~CLIPTokenizer.__call__`] if
+        `text` is not `None` to encode the text. To prepare the image(s), this method forwards the
+        `images` and `kwargs` arguments to CLIPFeatureExtractor's
+        [`~CLIPFeatureExtractor.__call__`] if `images` is not `None`. Please refer to the
         doctsring of the above two methods for more information.
 
         Args:
-            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text (`str`, `List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                 tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                 number of channels, H and W are image height and width.
 
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
-                * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
 
         Returns:
-            :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
 
-            - **input_ids** -- List of token ids to be fed to a model. Returned when :obj:`text` is not :obj:`None`.
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
-              :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names` and if
-              :obj:`text` is not :obj:`None`).
-            - **pixel_values** -- Pixel values to be fed to a model. Returned when :obj:`images` is not :obj:`None`.
+              `return_attention_mask=True` or if `"attention_mask"` is in `self.model_input_names` and if
+              `text` is not `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
         """
 
         if text is None and images is None:
@@ -158,14 +162,14 @@ class CLIPProcessor:
     def batch_decode(self, *args, **kwargs):
         """
         This method forwards all its arguments to CLIPTokenizer's
-        :meth:`~transformers.PreTrainedTokenizer.batch_decode`. Please refer to the docstring of this method for more
+        [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more
         information.
         """
         return self.tokenizer.batch_decode(*args, **kwargs)
 
     def decode(self, *args, **kwargs):
         """
-        This method forwards all its arguments to CLIPTokenizer's :meth:`~transformers.PreTrainedTokenizer.decode`.
+        This method forwards all its arguments to CLIPTokenizer's [`~PreTrainedTokenizer.decode`].
         Please refer to the docstring of this method for more information.
         """
         return self.tokenizer.decode(*args, **kwargs)
diff --git a/src/transformers/models/clip/tokenization_clip.py b/src/transformers/models/clip/tokenization_clip.py
index 474fc24421..a3da5bb56e 100644
--- a/src/transformers/models/clip/tokenization_clip.py
+++ b/src/transformers/models/clip/tokenization_clip.py
@@ -105,33 +105,34 @@ class CLIPTokenizer(PreTrainedTokenizer):
     be encoded differently whether it is at the beginning of the sentence (without space) or not:
 
 
-    You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
     call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
 
-    .. note::
+    <Tip>
 
-        When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first
-        one).
+    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first
+    one).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Path to the merges file.
-        errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
-            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+        bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The beginning of sequence token.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The end of sequence token.
-        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
             other word. (CLIP tokenizer detect beginning of words by the preceding space).
     """
@@ -200,7 +201,7 @@ class CLIPTokenizer(PreTrainedTokenizer):
     @property
     def pad_token_id(self) -> Optional[int]:
         """
-        :obj:`Optional[int]`: Id of the padding token in the vocabulary. Returns :obj:`None` if the token has not been
+        `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been
         set.
         """
         return 0
@@ -219,18 +220,18 @@ class CLIPTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A CLIP sequence has the following format:
 
-        - single sequence: ``<|startoftext|> X <|endoftext|>``
+        - single sequence: `<|startoftext|> X <|endoftext|>`
 
         Pairs of sequences are not the expected use case, but they will be handled without a separator.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
@@ -241,18 +242,18 @@ class CLIPTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
diff --git a/src/transformers/models/clip/tokenization_clip_fast.py b/src/transformers/models/clip/tokenization_clip_fast.py
index 876c6f7bf5..1870c3b8ae 100644
--- a/src/transformers/models/clip/tokenization_clip_fast.py
+++ b/src/transformers/models/clip/tokenization_clip_fast.py
@@ -49,51 +49,52 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class CLIPTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" CLIP tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level
+    Construct a "fast" CLIP tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
     Byte-Pair-Encoding.
 
     This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
     be encoded differently whether it is at the beginning of the sentence (without space) or not:
 
-    ::
+    ```python
+    >>> from transformers import CLIPTokenizerFast
+    >>> tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
+    >>> tokenizer("Hello world")['input_ids']
+    [15496, 995]
+    >>> tokenizer(" Hello world")['input_ids']
+    [18435, 995]
+    ```
 
-        >>> from transformers import CLIPTokenizerFast
-        >>> tokenizer = CLIPTokenizerFast.from_pretrained("openai/clip-vit-base-patch32")
-        >>> tokenizer("Hello world")['input_ids']
-        [15496, 995]
-        >>> tokenizer(" Hello world")['input_ids']
-        [18435, 995]
-
-    You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
     call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
 
-    .. note::
+    <Tip>
 
-        When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
-        ``add_prefix_space=True``.
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with
+    `add_prefix_space=True`.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Path to the merges file.
-        errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
-            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+        bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The beginning of sequence token.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The end of sequence token.
-        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
             other word. (CLIP tokenizer detect beginning of words by the preceding space).
-        trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        trim_offsets (`bool`, *optional*, defaults to `True`):
             Whether or not the post-processing step should trim offsets to avoid including whitespaces.
     """
 
@@ -139,7 +140,7 @@ class CLIPTokenizerFast(PreTrainedTokenizerFast):
     @property
     def pad_token_id(self) -> Optional[int]:
         """
-        :obj:`Optional[int]`: Id of the padding token in the vocabulary. Returns :obj:`None` if the token has not been
+        `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been
         set.
         """
         return 0
diff --git a/src/transformers/models/convbert/configuration_convbert.py b/src/transformers/models/convbert/configuration_convbert.py
index 1f904ddfce..bce7518754 100644
--- a/src/transformers/models/convbert/configuration_convbert.py
+++ b/src/transformers/models/convbert/configuration_convbert.py
@@ -30,61 +30,62 @@ CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class ConvBertConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.ConvBertModel`. It is used to
+    This is the configuration class to store the configuration of a [`ConvBertModel`]. It is used to
     instantiate an ConvBERT model according to the specified arguments, defining the model architecture. Instantiating
-    a configuration with the defaults will yield a similar configuration to that of the ConvBERT `conv-bert-base
-    <https://huggingface.co/YituTech/conv-bert-base>`__ architecture. Configuration objects inherit from
-    :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from
-    :class:`~transformers.PretrainedConfig` for more information.
+    a configuration with the defaults will yield a similar configuration to that of the ConvBERT [conv-bert-base](https://huggingface.co/YituTech/conv-bert-base) architecture. Configuration objects inherit from
+    [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from
+    [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the ConvBERT model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.ConvBertModel` or
-            :class:`~transformers.TFConvBertModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            the `inputs_ids` passed when calling [`ConvBertModel`] or
+            [`TFConvBertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.ConvBertModel`
-            or :class:`~transformers.TFConvBertModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`ConvBertModel`]
+            or [`TFConvBertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        head_ratio (:obj:`int`, `optional`, defaults to 2):
+        head_ratio (`int`, *optional*, defaults to 2):
             Ratio gamma to reduce the number of attention heads.
-        num_groups (:obj:`int`, `optional`, defaults to 1):
+        num_groups (`int`, *optional*, defaults to 1):
             The number of groups for grouped linear layers for ConvBert model
-        conv_kernel_size (:obj:`int`, `optional`, defaults to 9):
+        conv_kernel_size (`int`, *optional*, defaults to 9):
             The size of the convolutional kernel.
-        classifier_dropout (:obj:`float`, `optional`):
+        classifier_dropout (`float`, *optional*):
             The dropout ratio for the classification head.
 
-    Example::
-        >>> from transformers import ConvBertModel, ConvBertConfig
-        >>> # Initializing a ConvBERT convbert-base-uncased style configuration
-        >>> configuration = ConvBertConfig()
-        >>> # Initializing a model from the convbert-base-uncased style configuration
-        >>> model = ConvBertModel(configuration)
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    Example:
+
+    ```python
+    >>> from transformers import ConvBertModel, ConvBertConfig
+    >>> # Initializing a ConvBERT convbert-base-uncased style configuration
+    >>> configuration = ConvBertConfig()
+    >>> # Initializing a model from the convbert-base-uncased style configuration
+    >>> model = ConvBertModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "convbert"
 
     def __init__(
diff --git a/src/transformers/models/convbert/tokenization_convbert.py b/src/transformers/models/convbert/tokenization_convbert.py
index 12ee66ed28..e4a73f5018 100644
--- a/src/transformers/models/convbert/tokenization_convbert.py
+++ b/src/transformers/models/convbert/tokenization_convbert.py
@@ -45,9 +45,9 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class ConvBertTokenizer(BertTokenizer):
     r"""
-    Construct a ConvBERT tokenizer. :class:`~transformers.ConvBertTokenizer` is identical to
-    :class:`~transformers.BertTokenizer` and runs end-to-end tokenization: punctuation splitting and wordpiece. Refer
-    to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning parameters.
+    Construct a ConvBERT tokenizer. [`ConvBertTokenizer`] is identical to
+    [`BertTokenizer`] and runs end-to-end tokenization: punctuation splitting and wordpiece. Refer
+    to superclass [`BertTokenizer`] for usage examples and documentation concerning parameters.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/convbert/tokenization_convbert_fast.py b/src/transformers/models/convbert/tokenization_convbert_fast.py
index 4bc4c05234..8a0f42880c 100644
--- a/src/transformers/models/convbert/tokenization_convbert_fast.py
+++ b/src/transformers/models/convbert/tokenization_convbert_fast.py
@@ -46,12 +46,12 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class ConvBertTokenizerFast(BertTokenizerFast):
     r"""
-    Construct a "fast" ConvBERT tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" ConvBERT tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.ConvBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+    [`ConvBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/cpm/tokenization_cpm.py b/src/transformers/models/cpm/tokenization_cpm.py
index 7410128a92..5cd3a72012 100644
--- a/src/transformers/models/cpm/tokenization_cpm.py
+++ b/src/transformers/models/cpm/tokenization_cpm.py
@@ -33,59 +33,64 @@ class CpmTokenizer(XLNetTokenizer):
 
     def __init__(self, *args, **kwargs):
         """
-        Construct a CPM tokenizer. Based on `Jieba <https://pypi.org/project/jieba/>` and `SentencePiece
-        <https://github.com/google/sentencepiece>`__.
+        Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and [SentencePiece](https://github.com/google/sentencepiece).
 
-        This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main
+        This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main
         methods. Users should refer to this superclass for more information regarding those methods.
 
         Args:
-            vocab_file (:obj:`str`):
-                `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
+            vocab_file (`str`):
+                [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
                 contains the vocabulary necessary to instantiate a tokenizer.
-            do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            do_lower_case (`bool`, *optional*, defaults to `True`):
                 Whether to lowercase the input when tokenizing.
-            remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            remove_space (`bool`, *optional*, defaults to `True`):
                 Whether to strip the text when tokenizing (removing excess spaces before and after the string).
-            keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            keep_accents (`bool`, *optional*, defaults to `False`):
                 Whether to keep accents when tokenizing.
-            bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+            bos_token (`str`, *optional*, defaults to `"<s>"`):
                 The beginning of sequence token that was used during pretraining. Can be used a sequence classifier
                 token.
 
-                .. note::
+                <Tip>
 
-                    When building a sequence using special tokens, this is not the token that is used for the beginning
-                    of sequence. The token used is the :obj:`cls_token`.
-            eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+                When building a sequence using special tokens, this is not the token that is used for the beginning
+                of sequence. The token used is the `cls_token`.
+
+                </Tip>
+
+            eos_token (`str`, *optional*, defaults to `"</s>"`):
                 The end of sequence token.
 
-                .. note::
+                <Tip>
 
-                    When building a sequence using special tokens, this is not the token that is used for the end of
-                    sequence. The token used is the :obj:`sep_token`.
-            unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+                When building a sequence using special tokens, this is not the token that is used for the end of
+                sequence. The token used is the `sep_token`.
+
+                </Tip>
+
+            unk_token (`str`, *optional*, defaults to `"<unk>"`):
                 The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
                 this token instead.
-            sep_token (:obj:`str`, `optional`, defaults to :obj:`"<sep>"`):
+            sep_token (`str`, *optional*, defaults to `"<sep>"`):
                 The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
                 for sequence classification or for a text and a question for question answering. It is also used as the
                 last token of a sequence built with special tokens.
-            pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+            pad_token (`str`, *optional*, defaults to `"<pad>"`):
                 The token used for padding, for example when batching sequences of different lengths.
-            cls_token (:obj:`str`, `optional`, defaults to :obj:`"<cls>"`):
+            cls_token (`str`, *optional*, defaults to `"<cls>"`):
                 The classifier token which is used when doing sequence classification (classification of the whole
                 sequence instead of per-token classification). It is the first token of the sequence when built with
                 special tokens.
-            mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+            mask_token (`str`, *optional*, defaults to `"<mask>"`):
                 The token used for masking values. This is the token used when training this model with masked language
                 modeling. This is the token which the model will try to predict.
-            additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
+            additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
                 Additional special tokens used by the tokenizer.
 
         Attributes:
-            sp_model (:obj:`SentencePieceProcessor`):
-                The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+            sp_model (`SentencePieceProcessor`):
+                The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
         """
         super().__init__(*args, **kwargs)
         try:
diff --git a/src/transformers/models/cpm/tokenization_cpm_fast.py b/src/transformers/models/cpm/tokenization_cpm_fast.py
index 24a856c73c..42a627d88c 100644
--- a/src/transformers/models/cpm/tokenization_cpm_fast.py
+++ b/src/transformers/models/cpm/tokenization_cpm_fast.py
@@ -36,59 +36,64 @@ class CpmTokenizerFast(XLNetTokenizerFast):
 
     def __init__(self, *args, **kwargs):
         """
-        Construct a CPM tokenizer. Based on `Jieba <https://pypi.org/project/jieba/>` and `SentencePiece
-        <https://github.com/google/sentencepiece>`__.
+        Construct a CPM tokenizer. Based on [Jieba](https://pypi.org/project/jieba/) and [SentencePiece](https://github.com/google/sentencepiece).
 
-        This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main
+        This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main
         methods. Users should refer to this superclass for more information regarding those methods.
 
         Args:
-            vocab_file (:obj:`str`):
-                `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
+            vocab_file (`str`):
+                [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
                 contains the vocabulary necessary to instantiate a tokenizer.
-            do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            do_lower_case (`bool`, *optional*, defaults to `True`):
                 Whether to lowercase the input when tokenizing.
-            remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            remove_space (`bool`, *optional*, defaults to `True`):
                 Whether to strip the text when tokenizing (removing excess spaces before and after the string).
-            keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            keep_accents (`bool`, *optional*, defaults to `False`):
                 Whether to keep accents when tokenizing.
-            bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+            bos_token (`str`, *optional*, defaults to `"<s>"`):
                 The beginning of sequence token that was used during pretraining. Can be used a sequence classifier
                 token.
 
-                .. note::
+                <Tip>
 
-                    When building a sequence using special tokens, this is not the token that is used for the beginning
-                    of sequence. The token used is the :obj:`cls_token`.
-            eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+                When building a sequence using special tokens, this is not the token that is used for the beginning
+                of sequence. The token used is the `cls_token`.
+
+                </Tip>
+
+            eos_token (`str`, *optional*, defaults to `"</s>"`):
                 The end of sequence token.
 
-                .. note::
+                <Tip>
 
-                    When building a sequence using special tokens, this is not the token that is used for the end of
-                    sequence. The token used is the :obj:`sep_token`.
-            unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+                When building a sequence using special tokens, this is not the token that is used for the end of
+                sequence. The token used is the `sep_token`.
+
+                </Tip>
+
+            unk_token (`str`, *optional*, defaults to `"<unk>"`):
                 The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
                 this token instead.
-            sep_token (:obj:`str`, `optional`, defaults to :obj:`"<sep>"`):
+            sep_token (`str`, *optional*, defaults to `"<sep>"`):
                 The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
                 for sequence classification or for a text and a question for question answering. It is also used as the
                 last token of a sequence built with special tokens.
-            pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+            pad_token (`str`, *optional*, defaults to `"<pad>"`):
                 The token used for padding, for example when batching sequences of different lengths.
-            cls_token (:obj:`str`, `optional`, defaults to :obj:`"<cls>"`):
+            cls_token (`str`, *optional*, defaults to `"<cls>"`):
                 The classifier token which is used when doing sequence classification (classification of the whole
                 sequence instead of per-token classification). It is the first token of the sequence when built with
                 special tokens.
-            mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+            mask_token (`str`, *optional*, defaults to `"<mask>"`):
                 The token used for masking values. This is the token used when training this model with masked language
                 modeling. This is the token which the model will try to predict.
-            additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
+            additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
                 Additional special tokens used by the tokenizer.
 
         Attributes:
-            sp_model (:obj:`SentencePieceProcessor`):
-                The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+            sp_model (`SentencePieceProcessor`):
+                The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
         """
         super().__init__(*args, **kwargs)
         try:
diff --git a/src/transformers/models/ctrl/configuration_ctrl.py b/src/transformers/models/ctrl/configuration_ctrl.py
index 2db3f778f8..5c8aa366a0 100644
--- a/src/transformers/models/ctrl/configuration_ctrl.py
+++ b/src/transformers/models/ctrl/configuration_ctrl.py
@@ -25,57 +25,58 @@ CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP = {"ctrl": "https://huggingface.co/ctrl/resol
 
 class CTRLConfig(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a :class:`~transformers.CTRLModel` or a
-    :class:`~transformers.TFCTRLModel`. It is used to instantiate a CTRL model according to the specified arguments,
+    This is the configuration class to store the configuration of a [`CTRLModel`] or a
+    [`TFCTRLModel`]. It is used to instantiate a CTRL model according to the specified arguments,
     defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
-    to that of the `ctrl <https://huggingface.co/ctrl>`__ architecture from SalesForce.
+    to that of the [ctrl](https://huggingface.co/ctrl) architecture from SalesForce.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 246534):
+        vocab_size (`int`, *optional*, defaults to 246534):
             Vocabulary size of the CTRL model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.CTRLModel` or
-            :class:`~transformers.TFCTRLModel`.
-        n_positions (:obj:`int`, `optional`, defaults to 256):
+            `input_ids` passed when calling [`CTRLModel`] or
+            [`TFCTRLModel`].
+        n_positions (`int`, *optional*, defaults to 256):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        n_embd (:obj:`int`, `optional`, defaults to 1280):
+        n_embd (`int`, *optional*, defaults to 1280):
             Dimensionality of the embeddings and hidden states.
-        dff (:obj:`int`, `optional`, defaults to 8192):
+        dff (`int`, *optional*, defaults to 8192):
             Dimensionality of the inner dimension of the feed forward networks (FFN).
-        n_layer (:obj:`int`, `optional`, defaults to 48):
+        n_layer (`int`, *optional*, defaults to 48):
             Number of hidden layers in the Transformer encoder.
-        n_head (:obj:`int`, `optional`, defaults to 16):
+        n_head (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the embeddings.
-        attn_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention.
-        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-6):
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
             The epsilon to use in the layer normalization layers
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
 
 
-    Examples::
+    Examples:
 
-        >>> from transformers import CTRLModel, CTRLConfig
+    ```python
+    >>> from transformers import CTRLModel, CTRLConfig
 
-        >>> # Initializing a CTRL configuration
-        >>> configuration = CTRLConfig()
+    >>> # Initializing a CTRL configuration
+    >>> configuration = CTRLConfig()
 
-        >>> # Initializing a model from the configuration
-        >>> model = CTRLModel(configuration)
+    >>> # Initializing a model from the configuration
+    >>> model = CTRLModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "ctrl"
     keys_to_ignore_at_inference = ["past_key_values"]
diff --git a/src/transformers/models/ctrl/tokenization_ctrl.py b/src/transformers/models/ctrl/tokenization_ctrl.py
index 31ac0637a9..86c24f3125 100644
--- a/src/transformers/models/ctrl/tokenization_ctrl.py
+++ b/src/transformers/models/ctrl/tokenization_ctrl.py
@@ -120,15 +120,15 @@ class CTRLTokenizer(PreTrainedTokenizer):
     """
     Construct a CTRL tokenizer. Based on Byte-Pair-Encoding.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Path to the merges file.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
     """
diff --git a/src/transformers/models/deberta/configuration_deberta.py b/src/transformers/models/deberta/configuration_deberta.py
index 30a984f620..c7701db961 100644
--- a/src/transformers/models/deberta/configuration_deberta.py
+++ b/src/transformers/models/deberta/configuration_deberta.py
@@ -32,59 +32,59 @@ DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class DebertaConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.DebertaModel` or a
-    :class:`~transformers.TFDebertaModel`. It is used to instantiate a DeBERTa model according to the specified
+    This is the configuration class to store the configuration of a [`DebertaModel`] or a
+    [`TFDebertaModel`]. It is used to instantiate a DeBERTa model according to the specified
     arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
-    configuration to that of the DeBERTa `microsoft/deberta-base <https://huggingface.co/microsoft/deberta-base>`__
+    configuration to that of the DeBERTa [microsoft/deberta-base](https://huggingface.co/microsoft/deberta-base)
     architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Arguments:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the DeBERTa model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.DebertaModel` or
-            :class:`~transformers.TFDebertaModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `input_ids` passed when calling [`DebertaModel`] or
+            [`TFDebertaModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
-            :obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"`, `"tanh"`, `"gelu_fast"`,
+            `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.DebertaModel` or
-            :class:`~transformers.TFDebertaModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`DebertaModel`] or
+            [`TFDebertaModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        relative_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        relative_attention (`bool`, *optional*, defaults to `False`):
             Whether use relative position encoding.
-        max_relative_positions (:obj:`int`, `optional`, defaults to 1):
-            The range of relative positions :obj:`[-max_position_embeddings, max_position_embeddings]`. Use the same
-            value as :obj:`max_position_embeddings`.
-        pad_token_id (:obj:`int`, `optional`, defaults to 0):
+        max_relative_positions (`int`, *optional*, defaults to 1):
+            The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same
+            value as `max_position_embeddings`.
+        pad_token_id (`int`, *optional*, defaults to 0):
             The value used to pad input_ids.
-        position_biased_input (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        position_biased_input (`bool`, *optional*, defaults to `True`):
             Whether add absolute position embedding to content embedding.
-        pos_att_type (:obj:`List[str]`, `optional`):
-            The type of relative position attention, it can be a combination of :obj:`["p2c", "c2p", "p2p"]`, e.g.
-            :obj:`["p2c"]`, :obj:`["p2c", "c2p"]`, :obj:`["p2c", "c2p", 'p2p"]`.
-        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+        pos_att_type (`List[str]`, *optional*):
+            The type of relative position attention, it can be a combination of `["p2c", "c2p", "p2p"]`, e.g.
+            `["p2c"]`, `["p2c", "c2p"]`, `["p2c", "c2p", "p2p"]`.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
     """
     model_type = "deberta"
diff --git a/src/transformers/models/deberta/tokenization_deberta.py b/src/transformers/models/deberta/tokenization_deberta.py
index ddd08e5286..97ddff5d5e 100644
--- a/src/transformers/models/deberta/tokenization_deberta.py
+++ b/src/transformers/models/deberta/tokenization_deberta.py
@@ -64,23 +64,23 @@ class DebertaTokenizer(GPT2Tokenizer):
     Constructs a DeBERTa tokenizer, which runs end-to-end tokenization: punctuation splitting + wordpiece
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
     """
@@ -141,13 +141,13 @@ class DebertaTokenizer(GPT2Tokenizer):
         - pair of sequences: [CLS] A [SEP] B [SEP]
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -160,18 +160,18 @@ class DebertaTokenizer(GPT2Tokenizer):
     ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
@@ -189,21 +189,21 @@ class DebertaTokenizer(GPT2Tokenizer):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
         sequence pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/deberta/tokenization_deberta_fast.py b/src/transformers/models/deberta/tokenization_deberta_fast.py
index 54f82d6b1a..ae8e37abcc 100644
--- a/src/transformers/models/deberta/tokenization_deberta_fast.py
+++ b/src/transformers/models/deberta/tokenization_deberta_fast.py
@@ -63,26 +63,26 @@ PRETRAINED_INIT_CONFIGURATION = {
 class DebertaTokenizerFast(GPT2TokenizerFast):
     """
     Constructs a "fast" DeBERTa tokenizer, which runs end-to-end tokenization: punctuation splitting + wordpiece. It is
-    backed by HuggingFace's `tokenizers` library.
+    backed by HuggingFace's *tokenizers* library.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
     """
@@ -129,11 +129,11 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
     @property
     def mask_token(self) -> str:
         """
-        :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
+        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
         not having been set.
 
         Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
-        comprise the space before the `[MASK]`.
+        comprise the space before the *[MASK]*.
         """
         if self._mask_token is None and self.verbose:
             logger.error("Using mask_token, but it is not set yet.")
@@ -161,13 +161,13 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
         - pair of sequences: [CLS] A [SEP] B [SEP]
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -182,21 +182,21 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
         sequence pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/deberta_v2/configuration_deberta_v2.py b/src/transformers/models/deberta_v2/configuration_deberta_v2.py
index 9870979fb8..1c283f2cfa 100644
--- a/src/transformers/models/deberta_v2/configuration_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/configuration_deberta_v2.py
@@ -30,57 +30,57 @@ DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class DebertaV2Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.DebertaV2Model`. It is used
+    This is the configuration class to store the configuration of a [`DebertaV2Model`]. It is used
     to instantiate a DeBERTa-v2 model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the DeBERTa
-    `microsoft/deberta-v2-xlarge <https://huggingface.co/microsoft/deberta-base>`__ architecture.
+    [microsoft/deberta-v2-xlarge](https://huggingface.co/microsoft/deberta-v2-xlarge) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Arguments:
-        vocab_size (:obj:`int`, `optional`, defaults to 128100):
+        vocab_size (`int`, *optional*, defaults to 128100):
             Vocabulary size of the DeBERTa-v2 model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.DebertaV2Model`.
-        hidden_size (:obj:`int`, `optional`, defaults to 1536):
+            the `input_ids` passed when calling [`DebertaV2Model`].
+        hidden_size (`int`, *optional*, defaults to 1536):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 24):
+        num_hidden_layers (`int`, *optional*, defaults to 24):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 24):
+        num_attention_heads (`int`, *optional*, defaults to 24):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 6144):
+        intermediate_size (`int`, *optional*, defaults to 6144):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"`, :obj:`"gelu"`, :obj:`"tanh"`, :obj:`"gelu_fast"`,
-            :obj:`"mish"`, :obj:`"linear"`, :obj:`"sigmoid"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"`, `"tanh"`, `"gelu_fast"`,
+            `"mish"`, `"linear"`, `"sigmoid"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 0):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.DebertaModel` or
-            :class:`~transformers.TFDebertaModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 0):
+            The vocabulary size of the `token_type_ids` passed when calling [`DebertaV2Model`] or
+            [`TFDebertaV2Model`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-7):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-7):
             The epsilon used by the layer normalization layers.
-        relative_attention (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        relative_attention (`bool`, *optional*, defaults to `True`):
             Whether use relative position encoding.
-        max_relative_positions (:obj:`int`, `optional`, defaults to -1):
-            The range of relative positions :obj:`[-max_position_embeddings, max_position_embeddings]`. Use the same
-            value as :obj:`max_position_embeddings`.
-        pad_token_id (:obj:`int`, `optional`, defaults to 0):
+        max_relative_positions (`int`, *optional*, defaults to -1):
+            The range of relative positions `[-max_position_embeddings, max_position_embeddings]`. Use the same
+            value as `max_position_embeddings`.
+        pad_token_id (`int`, *optional*, defaults to 0):
             The value used to pad input_ids.
-        position_biased_input (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        position_biased_input (`bool`, *optional*, defaults to `False`):
             Whether add absolute position embedding to content embedding.
-        pos_att_type (:obj:`List[str]`, `optional`):
-            The type of relative position attention, it can be a combination of :obj:`["p2c", "c2p", "p2p"]`, e.g.
-            :obj:`["p2c"]`, :obj:`["p2c", "c2p"]`, :obj:`["p2c", "c2p", 'p2p"]`.
-        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+        pos_att_type (`List[str]`, *optional*):
+            The type of relative position attention, it can be a combination of `["p2c", "c2p", "p2p"]`, e.g.
+            `["p2c"]`, `["p2c", "c2p"]`, `["p2c", "c2p", "p2p"]`.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
     """
     model_type = "deberta-v2"
diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
index dfca91fb1a..5c6612af1a 100644
--- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
+++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py
@@ -52,49 +52,48 @@ VOCAB_FILES_NAMES = {"vocab_file": "spm.model"}
 
 class DebertaV2Tokenizer(PreTrainedTokenizer):
     r"""
-    Constructs a DeBERTa-v2 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Constructs a DeBERTa-v2 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        do_lower_case (`bool`, *optional*, defaults to `False`):
             Whether or not to lowercase the input when tokenizing.
-        bos_token (:obj:`string`, `optional`, defaults to "[CLS]"):
+        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
             The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
             When building a sequence using special tokens, this is not the token that is used for the beginning of
-            sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`string`, `optional`, defaults to "[SEP]"):
+            sequence. The token used is the `cls_token`.
+        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
             The end of sequence token. When building a sequence using special tokens, this is not the token that is
-            used for the end of sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+            used for the end of sequence. The token used is the `sep_token`.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
     """
 
@@ -183,13 +182,13 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
         - pair of sequences: [CLS] A [SEP] B [SEP]
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         if token_ids_1 is None:
@@ -201,18 +200,18 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
     def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -229,21 +228,21 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A DeBERTa
         sequence pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
@@ -264,25 +263,24 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
 
 class SPMTokenizer:
     r"""
-    Constructs a tokenizer based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Constructs a tokenizer based on [SentencePiece](https://github.com/google/sentencepiece).
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
     """
 
diff --git a/src/transformers/models/deit/configuration_deit.py b/src/transformers/models/deit/configuration_deit.py
index d394431925..09d979daef 100644
--- a/src/transformers/models/deit/configuration_deit.py
+++ b/src/transformers/models/deit/configuration_deit.py
@@ -28,59 +28,60 @@ DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class DeiTConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.DeiTModel`. It is used to
+    This is the configuration class to store the configuration of a [`DeiTModel`]. It is used to
     instantiate an DeiT model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the DeiT
-    `facebook/deit-base-distilled-patch16-224 <https://huggingface.co/facebook/deit-base-distilled-patch16-224>`__
+    [facebook/deit-base-distilled-patch16-224](https://huggingface.co/facebook/deit-base-distilled-patch16-224)
     architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        image_size (:obj:`int`, `optional`, defaults to :obj:`224`):
+        image_size (`int`, *optional*, defaults to `224`):
             The size (resolution) of each image.
-        patch_size (:obj:`int`, `optional`, defaults to :obj:`16`):
+        patch_size (`int`, *optional*, defaults to `16`):
             The size (resolution) of each patch.
-        num_channels (:obj:`int`, `optional`, defaults to :obj:`3`):
+        num_channels (`int`, *optional*, defaults to `3`):
             The number of input channels.
-        qkv_bias (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        qkv_bias (`bool`, *optional*, defaults to `True`):
             Whether to add a bias to the queries, keys and values.
 
 
-    Example::
+    Example:
 
-        >>> from transformers import DeiTModel, DeiTConfig
+    ```python
+    >>> from transformers import DeiTModel, DeiTConfig
 
-        >>> # Initializing a DeiT deit-base-distilled-patch16-224 style configuration
-        >>> configuration = DeiTConfig()
+    >>> # Initializing a DeiT deit-base-distilled-patch16-224 style configuration
+    >>> configuration = DeiTConfig()
 
-        >>> # Initializing a model from the deit-base-distilled-patch16-224 style configuration
-        >>> model = DeiTModel(configuration)
+    >>> # Initializing a model from the deit-base-distilled-patch16-224 style configuration
+    >>> model = DeiTModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "deit"
 
     def __init__(
diff --git a/src/transformers/models/deit/feature_extraction_deit.py b/src/transformers/models/deit/feature_extraction_deit.py
index b5d86ebba6..a5ed140303 100644
--- a/src/transformers/models/deit/feature_extraction_deit.py
+++ b/src/transformers/models/deit/feature_extraction_deit.py
@@ -38,31 +38,31 @@ class DeiTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
     r"""
     Constructs a DeiT feature extractor.
 
-    This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main
+    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to resize the input to a certain :obj:`size`.
-        size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 256):
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input to a certain `size`.
+        size (`int` or `Tuple(int)`, *optional*, defaults to 256):
             Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
-            integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize`
-            is set to :obj:`True`.
-        resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BICUBIC`):
-            An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
-            :obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
-            Only has an effect if :obj:`do_resize` is set to :obj:`True`.
-        do_center_crop (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to crop the input at the center. If the input size is smaller than :obj:`crop_size` along any edge,
+            integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
+            is set to `True`.
+        resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`):
+            An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
+            `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
+            Only has an effect if `do_resize` is set to `True`.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge,
             the image is padded with 0's and then center cropped.
-        crop_size (:obj:`int`, `optional`, defaults to 224):
-            Desired output size when applying center-cropping. Only has an effect if :obj:`do_center_crop` is set to
-            :obj:`True`.
-        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to normalize the input with :obj:`image_mean` and :obj:`image_std`.
-        image_mean (:obj:`List[int]`, defaults to :obj:`[0.485, 0.456, 0.406]`):
+        crop_size (`int`, *optional*, defaults to 224):
+            Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to
+            `True`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether or not to normalize the input with `image_mean` and `image_std`.
+        image_mean (`List[float]`, defaults to `[0.485, 0.456, 0.406]`):
             The sequence of means for each channel, to be used when normalizing images.
-        image_std (:obj:`List[int]`, defaults to :obj:`[0.229, 0.224, 0.225]`):
+        image_std (`List[float]`, defaults to `[0.229, 0.224, 0.225]`):
             The sequence of standard deviations for each channel, to be used when normalizing images.
     """
 
@@ -96,27 +96,29 @@ class DeiTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
         """
         Main method to prepare for the model one or several image(s).
 
-        .. warning::
+        <Tip warning={true}>
 
-           NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
-           PIL images.
+        NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so it is most efficient to pass
+        PIL images.
+
+        </Tip>
 
         Args:
-            images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                 tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                 number of channels, H and W are image height and width.
 
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
                 If set, will return tensors of a particular framework. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
-                * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
 
         Returns:
-            :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
 
             - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
               width).
diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py
index dbcf11c37a..190ea7c7eb 100644
--- a/src/transformers/models/deit/modeling_deit.py
+++ b/src/transformers/models/deit/modeling_deit.py
@@ -487,22 +487,23 @@ class DeiTModel(DeiTPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import DeiTFeatureExtractor, DeiTModel
-            >>> from PIL import Image
-            >>> import requests
+        ```python
+        >>> from transformers import DeiTFeatureExtractor, DeiTModel
+        >>> from PIL import Image
+        >>> import requests
 
-            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-            >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> image = Image.open(requests.get(url, stream=True).raw)
 
-            >>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
-            >>> model = DeiTModel.from_pretrained('facebook/deit-base-distilled-patch16-224', add_pooling_layer=False)
+        >>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
+        >>> model = DeiTModel.from_pretrained('facebook/deit-base-distilled-patch16-224', add_pooling_layer=False)
 
-            >>> inputs = feature_extractor(images=image, return_tensors="pt")
-            >>> outputs = model(**inputs)
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> inputs = feature_extractor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -729,25 +730,26 @@ class DeiTForImageClassificationWithTeacher(DeiTPreTrainedModel):
         """
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import DeiTFeatureExtractor, DeiTForImageClassificationWithTeacher
-            >>> from PIL import Image
-            >>> import requests
+        ```python
+        >>> from transformers import DeiTFeatureExtractor, DeiTForImageClassificationWithTeacher
+        >>> from PIL import Image
+        >>> import requests
 
-            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-            >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> image = Image.open(requests.get(url, stream=True).raw)
 
-            >>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
-            >>> model = DeiTForImageClassificationWithTeacher.from_pretrained('facebook/deit-base-distilled-patch16-224')
+        >>> feature_extractor = DeiTFeatureExtractor.from_pretrained('facebook/deit-base-distilled-patch16-224')
+        >>> model = DeiTForImageClassificationWithTeacher.from_pretrained('facebook/deit-base-distilled-patch16-224')
 
-            >>> inputs = feature_extractor(images=image, return_tensors="pt")
-            >>> outputs = model(**inputs)
-            >>> logits = outputs.logits
-            >>> # model predicts one of the 1000 ImageNet classes
-            >>> predicted_class_idx = logits.argmax(-1).item()
-            >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
-        """
+        >>> inputs = feature_extractor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> logits = outputs.logits
+        >>> # model predicts one of the 1000 ImageNet classes
+        >>> predicted_class_idx = logits.argmax(-1).item()
+        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+        ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         outputs = self.deit(
diff --git a/src/transformers/models/detr/configuration_detr.py b/src/transformers/models/detr/configuration_detr.py
index 2f6392a3c6..3edc9da80e 100644
--- a/src/transformers/models/detr/configuration_detr.py
+++ b/src/transformers/models/detr/configuration_detr.py
@@ -28,93 +28,92 @@ DETR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class DetrConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.DetrModel`. It is used to
+    This is the configuration class to store the configuration of a [`DetrModel`]. It is used to
     instantiate a DETR model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the DETR `facebook/detr-resnet-50
-    <https://huggingface.co/facebook/detr-resnet-50>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the DETR [facebook/detr-resnet-50](https://huggingface.co/facebook/detr-resnet-50) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        num_queries (:obj:`int`, `optional`, defaults to 100):
+        num_queries (`int`, *optional*, defaults to 100):
             Number of object queries, i.e. detection slots. This is the maximal number of objects
-            :class:`~transformers.DetrModel` can detect in a single image. For COCO, we recommend 100 queries.
-        d_model (:obj:`int`, `optional`, defaults to 256):
+            [`DetrModel`] can detect in a single image. For COCO, we recommend 100 queries.
+        d_model (`int`, *optional*, defaults to 256):
             Dimension of the layers.
-        encoder_layers (:obj:`int`, `optional`, defaults to 6):
+        encoder_layers (`int`, *optional*, defaults to 6):
             Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 6):
+        decoder_layers (`int`, *optional*, defaults to 6):
             Number of decoder layers.
-        encoder_attention_heads (:obj:`int`, `optional`, defaults to 8):
+        encoder_attention_heads (`int`, *optional*, defaults to 8):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 8):
+        decoder_attention_heads (`int`, *optional*, defaults to 8):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
+        decoder_ffn_dim (`int`, *optional*, defaults to 2048):
             Dimension of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
+        encoder_ffn_dim (`int`, *optional*, defaults to 2048):
             Dimension of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`):
+        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        init_xavier_std (:obj:`float`, `optional`, defaults to 1):
+        init_xavier_std (`float`, *optional*, defaults to 1):
             The scaling factor used for the Xavier initialization gain in the HM Attention map module.
-        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        auxiliary_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        auxiliary_loss (`bool`, *optional*, defaults to `False`):
             Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
-        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"sine"`):
-            Type of position embeddings to be used on top of the image features. One of :obj:`"sine"` or
-            :obj:`"learned"`.
-        backbone (:obj:`str`, `optional`, defaults to :obj:`"resnet50"`):
+        position_embedding_type (`str`, *optional*, defaults to `"sine"`):
+            Type of position embeddings to be used on top of the image features. One of `"sine"` or
+            `"learned"`.
+        backbone (`str`, *optional*, defaults to `"resnet50"`):
             Name of convolutional backbone to use. Supports any convolutional backbone from the timm package. For a
-            list of all available models, see `this page
-            <https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model>`__.
-        dilation (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            list of all available models, see [this page](https://rwightman.github.io/pytorch-image-models/#load-a-pretrained-model).
+        dilation (`bool`, *optional*, defaults to `False`):
             Whether to replace stride with dilation in the last convolutional block (DC5).
-        class_cost (:obj:`float`, `optional`, defaults to 1):
+        class_cost (`float`, *optional*, defaults to 1):
             Relative weight of the classification error in the Hungarian matching cost.
-        bbox_cost (:obj:`float`, `optional`, defaults to 5):
+        bbox_cost (`float`, *optional*, defaults to 5):
             Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
-        giou_cost (:obj:`float`, `optional`, defaults to 2):
+        giou_cost (`float`, *optional*, defaults to 2):
             Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
-        mask_loss_coefficient (:obj:`float`, `optional`, defaults to 1):
+        mask_loss_coefficient (`float`, *optional*, defaults to 1):
             Relative weight of the Focal loss in the panoptic segmentation loss.
-        dice_loss_coefficient (:obj:`float`, `optional`, defaults to 1):
+        dice_loss_coefficient (`float`, *optional*, defaults to 1):
             Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
-        bbox_loss_coefficient (:obj:`float`, `optional`, defaults to 5):
+        bbox_loss_coefficient (`float`, *optional*, defaults to 5):
             Relative weight of the L1 bounding box loss in the object detection loss.
-        giou_loss_coefficient (:obj:`float`, `optional`, defaults to 2):
+        giou_loss_coefficient (`float`, *optional*, defaults to 2):
             Relative weight of the generalized IoU loss in the object detection loss.
-        eos_coefficient (:obj:`float`, `optional`, defaults to 0.1):
+        eos_coefficient (`float`, *optional*, defaults to 0.1):
             Relative classification weight of the 'no-object' class in the object detection loss.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import DetrModel, DetrConfig
+    ```python
+    >>> from transformers import DetrModel, DetrConfig
 
-        >>> # Initializing a DETR facebook/detr-resnet-50 style configuration
-        >>> configuration = DetrConfig()
+    >>> # Initializing a DETR facebook/detr-resnet-50 style configuration
+    >>> configuration = DetrConfig()
 
-        >>> # Initializing a model from the facebook/detr-resnet-50 style configuration
-        >>> model = DetrModel(configuration)
+    >>> # Initializing a model from the facebook/detr-resnet-50 style configuration
+    >>> model = DetrModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "detr"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {
diff --git a/src/transformers/models/detr/feature_extraction_detr.py b/src/transformers/models/detr/feature_extraction_detr.py
index e208961c03..a2f93ac2a2 100644
--- a/src/transformers/models/detr/feature_extraction_detr.py
+++ b/src/transformers/models/detr/feature_extraction_detr.py
@@ -124,28 +124,28 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
     r"""
     Constructs a DETR feature extractor.
 
-    This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main
+    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
 
     Args:
-        format (:obj:`str`, `optional`, defaults to :obj:`"coco_detection"`):
+        format (`str`, *optional*, defaults to `"coco_detection"`):
             Data format of the annotations. One of "coco_detection" or "coco_panoptic".
-        do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to resize the input to a certain :obj:`size`.
-        size (:obj:`int`, `optional`, defaults to 800):
-            Resize the input to the given size. Only has an effect if :obj:`do_resize` is set to :obj:`True`. If size
-            is a sequence like :obj:`(width, height)`, output size will be matched to this. If size is an int, smaller
-            edge of the image will be matched to this number. i.e, if :obj:`height > width`, then image will be
-            rescaled to :obj:`(size * height / width, size)`.
-        max_size (:obj:`int`, `optional`, defaults to :obj:`1333`):
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input to a certain `size`.
+        size (`int`, *optional*, defaults to 800):
+            Resize the input to the given size. Only has an effect if `do_resize` is set to `True`. If size
+            is a sequence like `(width, height)`, output size will be matched to this. If size is an int, smaller
+            edge of the image will be matched to this number. i.e., if `height > width`, then image will be
+            rescaled to `(size * height / width, size)`.
+        max_size (`int`, *optional*, defaults to `1333`):
             The largest size an image dimension can have (otherwise it's capped). Only has an effect if
-            :obj:`do_resize` is set to :obj:`True`.
-        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            `do_resize` is set to `True`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
             Whether or not to normalize the input with mean and standard deviation.
-        image_mean (:obj:`int`, `optional`, defaults to :obj:`[0.485, 0.456, 0.406]`):
+        image_mean (`List[float]`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
             The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean.
-        image_std (:obj:`int`, `optional`, defaults to :obj:`[0.229, 0.224, 0.225]`):
+        image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
             The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
             ImageNet std.
     """
@@ -416,39 +416,37 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
         padded up to the largest image in a batch, and a pixel mask is created that indicates which pixels are
         real/which are padding.
 
-        .. warning::
+        <Tip warning={true}>
 
-           NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
-           PIL images.
+        NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so it is most efficient to pass
+        PIL images.
+
+        </Tip>
 
         Args:
-            images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                 tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                 number of channels, H and W are image height and width.
 
-            annotations (:obj:`Dict`, :obj:`List[Dict]`, `optional`):
+            annotations (`Dict`, `List[Dict]`, *optional*):
                 The corresponding annotations in COCO format.
 
-                In case :class:`~transformers.DetrFeatureExtractor` was initialized with :obj:`format =
-                "coco_detection"`, the annotations for each image should have the following format: {'image_id': int,
+                In case [`DetrFeatureExtractor`] was initialized with `format = "coco_detection"`, the annotations for each image should have the following format: {'image_id': int,
                 'annotations': [annotation]}, with the annotations being a list of COCO object annotations.
 
-                In case :class:`~transformers.DetrFeatureExtractor` was initialized with :obj:`format =
-                "coco_panoptic"`, the annotations for each image should have the following format: {'image_id': int,
+                In case [`DetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`, the annotations for each image should have the following format: {'image_id': int,
                 'file_name': str, 'segments_info': [segment_info]} with segments_info being a list of COCO panoptic
                 annotations.
 
-            return_segmentation_masks (:obj:`Dict`, :obj:`List[Dict]`, `optional`, defaults to :obj:`False`):
-                Whether to also include instance segmentation masks as part of the labels in case :obj:`format =
-                "coco_detection"`.
+            return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+                Whether to also include instance segmentation masks as part of the labels in case `format = "coco_detection"`.
 
-            masks_path (:obj:`pathlib.Path`, `optional`):
+            masks_path (`pathlib.Path`, *optional*):
                 Path to the directory containing the PNG files that store the class-agnostic image segmentations. Only
-                relevant in case :class:`~transformers.DetrFeatureExtractor` was initialized with :obj:`format =
-                "coco_panoptic"`.
+                relevant in case [`DetrFeatureExtractor`] was initialized with `format = "coco_panoptic"`.
 
-            pad_and_return_pixel_mask (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`):
                 Whether or not to pad images up to the largest image in a batch and create a pixel mask.
 
                 If left to the default, will return a pixel mask that is:
@@ -456,17 +454,17 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
                 - 1 for pixels that are real (i.e. **not masked**),
                 - 0 for pixels that are padding (i.e. **masked**).
 
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
-                If set, will return tensors instead of NumPy arrays. If set to :obj:`'pt'`, return PyTorch
-                :obj:`torch.Tensor` objects.
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+                If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch
+                `torch.Tensor` objects.
 
         Returns:
-            :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
 
             - **pixel_values** -- Pixel values to be fed to a model.
-            - **pixel_mask** -- Pixel mask to be fed to a model (when :obj:`pad_and_return_pixel_mask=True` or if
-              `"pixel_mask"` is in :obj:`self.model_input_names`).
-            - **labels** -- Optional labels to be fed to a model (when :obj:`annotations` are provided)
+            - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if
+              `"pixel_mask"` is in `self.model_input_names`).
+            - **labels** -- Optional labels to be fed to a model (when `annotations` are provided)
         """
         # Input type checking for clearer error
 
@@ -634,21 +632,21 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
         self, pixel_values_list: List["torch.Tensor"], return_tensors: Optional[Union[str, TensorType]] = None
     ):
         """
-        Pad images up to the largest image in a batch and create a corresponding :obj:`pixel_mask`.
+        Pad images up to the largest image in a batch and create a corresponding `pixel_mask`.
 
         Args:
-            pixel_values_list (:obj:`List[torch.Tensor]`):
+            pixel_values_list (`List[torch.Tensor]`):
                 List of images (pixel values) to be padded. Each image should be a tensor of shape (C, H, W).
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
-                If set, will return tensors instead of NumPy arrays. If set to :obj:`'pt'`, return PyTorch
-                :obj:`torch.Tensor` objects.
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+                If set, will return tensors instead of NumPy arrays. If set to `'pt'`, return PyTorch
+                `torch.Tensor` objects.
 
         Returns:
-            :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
 
             - **pixel_values** -- Pixel values to be fed to a model.
-            - **pixel_mask** -- Pixel mask to be fed to a model (when :obj:`pad_and_return_pixel_mask=True` or if
-              `"pixel_mask"` is in :obj:`self.model_input_names`).
+            - **pixel_mask** -- Pixel mask to be fed to a model (when `pad_and_return_pixel_mask=True` or if
+              `"pixel_mask"` is in `self.model_input_names`).
 
         """
 
@@ -676,19 +674,19 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
     # inspired by https://github.com/facebookresearch/detr/blob/master/models/detr.py#L258
     def post_process(self, outputs, target_sizes):
         """
-        Converts the output of :class:`~transformers.DetrForObjectDetection` into the format expected by the COCO api.
+        Converts the output of [`DetrForObjectDetection`] into the format expected by the COCO API.
         Only supports PyTorch.
 
         Args:
-            outputs (:class:`~transformers.DetrObjectDetectionOutput`):
+            outputs ([`DetrObjectDetectionOutput`]):
                 Raw outputs of the model.
-            target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`, `optional`):
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)`, *optional*):
                 Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
                 image size (before any data augmentation). For visualization, this should be the image size after data
                 augment, but before padding.
 
         Returns:
-            :obj:`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an
+            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an
             image in the batch as predicted by the model.
         """
         out_logits, out_bbox = outputs.logits, outputs.pred_boxes
@@ -714,21 +712,21 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
 
     def post_process_segmentation(self, outputs, target_sizes, threshold=0.9, mask_threshold=0.5):
         """
-        Converts the output of :class:`~transformers.DetrForSegmentation` into image segmentation predictions. Only
+        Converts the output of [`DetrForSegmentation`] into image segmentation predictions. Only
         supports PyTorch.
 
         Parameters:
-            outputs (:class:`~transformers.DetrSegmentationOutput`):
+            outputs ([`DetrSegmentationOutput`]):
                 Raw outputs of the model.
-            target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)` or :obj:`List[Tuple]` of length :obj:`batch_size`):
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
                 Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction.
-            threshold (:obj:`float`, `optional`, defaults to 0.9):
+            threshold (`float`, *optional*, defaults to 0.9):
                 Threshold to use to filter out queries.
-            mask_threshold (:obj:`float`, `optional`, defaults to 0.5):
+            mask_threshold (`float`, *optional*, defaults to 0.5):
                 Threshold to use when turning the predicted masks into binary values.
 
         Returns:
-            :obj:`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an
+            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, and masks for an
             image in the batch as predicted by the model.
         """
         out_logits, raw_masks = outputs.logits, outputs.pred_masks
@@ -757,26 +755,26 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
     # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L218
     def post_process_instance(self, results, outputs, orig_target_sizes, max_target_sizes, threshold=0.5):
         """
-        Converts the output of :class:`~transformers.DetrForSegmentation` into actual instance segmentation
+        Converts the output of [`DetrForSegmentation`] into actual instance segmentation
         predictions. Only supports PyTorch.
 
         Args:
-            results (:obj:`List[Dict]`):
-                Results list obtained by :meth:`~transformers.DetrFeatureExtractor.post_process`, to which "masks"
+            results (`List[Dict]`):
+                Results list obtained by [`~DetrFeatureExtractor.post_process`], to which "masks"
                 results will be added.
-            outputs (:class:`~transformers.DetrSegmentationOutput`):
+            outputs ([`DetrSegmentationOutput`]):
                 Raw outputs of the model.
-            orig_target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`):
+            orig_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
                 Tensor containing the size (h, w) of each image of the batch. For evaluation, this must be the original
                 image size (before any data augmentation).
-            max_target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`):
+            max_target_sizes (`torch.Tensor` of shape `(batch_size, 2)`):
                 Tensor containing the maximum size (h, w) of each image of the batch. For evaluation, this must be the
                 original image size (before any data augmentation).
-            threshold (:obj:`float`, `optional`, defaults to 0.5):
+            threshold (`float`, *optional*, defaults to 0.5):
                 Threshold to use when turning the predicted masks into binary values.
 
         Returns:
-            :obj:`List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks
+            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels, boxes and masks
             for an image in the batch as predicted by the model.
         """
 
@@ -801,26 +799,26 @@ class DetrFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
     # inspired by https://github.com/facebookresearch/detr/blob/master/models/segmentation.py#L241
     def post_process_panoptic(self, outputs, processed_sizes, target_sizes=None, is_thing_map=None, threshold=0.85):
         """
-        Converts the output of :class:`~transformers.DetrForSegmentation` into actual panoptic predictions. Only
+        Converts the output of [`DetrForSegmentation`] into actual panoptic predictions. Only
         supports PyTorch.
 
         Parameters:
-            outputs (:class:`~transformers.DetrSegmentationOutput`):
+            outputs ([`DetrSegmentationOutput`]):
                 Raw outputs of the model.
-            processed_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)` or :obj:`List[Tuple]` of length :obj:`batch_size`):
+            processed_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`):
                 Torch Tensor (or list) containing the size (h, w) of each image of the batch, i.e. the size after data
                 augmentation but before batching.
-            target_sizes (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)` or :obj:`List[Tuple]` of length :obj:`batch_size`, `optional`):
+            target_sizes (`torch.Tensor` of shape `(batch_size, 2)` or `List[Tuple]` of length `batch_size`, *optional*):
                 Torch Tensor (or list) corresponding to the requested final size (h, w) of each prediction. If left to
-                None, it will default to the :obj:`processed_sizes`.
-            is_thing_map (:obj:`torch.Tensor` of shape :obj:`(batch_size, 2)`, `optional`):
+                None, it will default to the `processed_sizes`.
+            is_thing_map (`Dict[int, bool]`, *optional*):
                 Dictionary mapping class indices to either True or False, depending on whether or not they are a thing.
-                If not set, defaults to the :obj:`is_thing_map` of COCO panoptic.
-            threshold (:obj:`float`, `optional`, defaults to 0.85):
+                If not set, defaults to the `is_thing_map` of COCO panoptic.
+            threshold (`float`, *optional*, defaults to 0.85):
                 Threshold to use to filter out queries.
 
         Returns:
-            :obj:`List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values
+            `List[Dict]`: A list of dictionaries, each dictionary containing a PNG string and segments_info values
             for an image in the batch as predicted by the model.
         """
         if target_sizes is None:
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index 0895290aa7..7d1140577a 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -1205,21 +1205,22 @@ class DetrModel(DetrPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import DetrFeatureExtractor, DetrModel
-            >>> from PIL import Image
-            >>> import requests
+        ```python
+        >>> from transformers import DetrFeatureExtractor, DetrModel
+        >>> from PIL import Image
+        >>> import requests
 
-            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-            >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> image = Image.open(requests.get(url, stream=True).raw)
 
-            >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50')
-            >>> model = DetrModel.from_pretrained('facebook/detr-resnet-50')
-            >>> inputs = feature_extractor(images=image, return_tensors="pt")
-            >>> outputs = model(**inputs)
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> feature_extractor = DetrFeatureExtractor.from_pretrained('facebook/detr-resnet-50')
+        >>> model = DetrModel.from_pretrained('facebook/detr-resnet-50')
+        >>> inputs = feature_extractor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/distilbert/configuration_distilbert.py b/src/transformers/models/distilbert/configuration_distilbert.py
index 733714e721..09ffe1619c 100644
--- a/src/transformers/models/distilbert/configuration_distilbert.py
+++ b/src/transformers/models/distilbert/configuration_distilbert.py
@@ -36,62 +36,62 @@ DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class DistilBertConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.DistilBertModel` or a
-    :class:`~transformers.TFDistilBertModel`. It is used to instantiate a DistilBERT model according to the specified
+    This is the configuration class to store the configuration of a [`DistilBertModel`] or a
+    [`TFDistilBertModel`]. It is used to instantiate a DistilBERT model according to the specified
     arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
-    configuration to that of the DistilBERT `distilbert-base-uncased
-    <https://huggingface.co/distilbert-base-uncased>`__ architecture.
+    configuration to that of the DistilBERT [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the DistilBERT model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.DistilBertModel` or
-            :class:`~transformers.TFDistilBertModel`.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+            the `input_ids` passed when calling [`DistilBertModel`] or
+            [`TFDistilBertModel`].
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        sinusoidal_pos_embds (:obj:`boolean`, `optional`, defaults to :obj:`False`):
+        sinusoidal_pos_embds (`bool`, *optional*, defaults to `False`):
             Whether to use sinusoidal positional embeddings.
-        n_layers (:obj:`int`, `optional`, defaults to 6):
+        n_layers (`int`, *optional*, defaults to 6):
             Number of hidden layers in the Transformer encoder.
-        n_heads (:obj:`int`, `optional`, defaults to 12):
+        n_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        dim (:obj:`int`, `optional`, defaults to 768):
+        dim (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        hidden_dim (:obj:`int`, `optional`, defaults to 3072):
+        hidden_dim (`int`, *optional*, defaults to 3072):
             The size of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        attention_dropout (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+        activation (`str` or `Callable`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        qa_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        qa_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probabilities used in the question answering model
-            :class:`~transformers.DistilBertForQuestionAnswering`.
-        seq_classif_dropout (:obj:`float`, `optional`, defaults to 0.2):
+            [`DistilBertForQuestionAnswering`].
+        seq_classif_dropout (`float`, *optional*, defaults to 0.2):
             The dropout probabilities used in the sequence classification and the multiple choice model
-            :class:`~transformers.DistilBertForSequenceClassification`.
+            [`DistilBertForSequenceClassification`].
 
-    Examples::
+    Examples:
 
-        >>> from transformers import DistilBertModel, DistilBertConfig
+    ```python
+    >>> from transformers import DistilBertModel, DistilBertConfig
 
-        >>> # Initializing a DistilBERT configuration
-        >>> configuration = DistilBertConfig()
+    >>> # Initializing a DistilBERT configuration
+    >>> configuration = DistilBertConfig()
 
-        >>> # Initializing a model from the configuration
-        >>> model = DistilBertModel(configuration)
+    >>> # Initializing a model from the configuration
+    >>> model = DistilBertModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "distilbert"
     attribute_map = {
         "hidden_size": "dim",
diff --git a/src/transformers/models/distilbert/tokenization_distilbert.py b/src/transformers/models/distilbert/tokenization_distilbert.py
index 50dc80bdf4..a04c1059c7 100644
--- a/src/transformers/models/distilbert/tokenization_distilbert.py
+++ b/src/transformers/models/distilbert/tokenization_distilbert.py
@@ -57,10 +57,10 @@ class DistilBertTokenizer(BertTokenizer):
     r"""
     Construct a DistilBERT tokenizer.
 
-    :class:`~transformers.DistilBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    [`DistilBertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
     tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/distilbert/tokenization_distilbert_fast.py b/src/transformers/models/distilbert/tokenization_distilbert_fast.py
index 4007d4e871..3b052f5cef 100644
--- a/src/transformers/models/distilbert/tokenization_distilbert_fast.py
+++ b/src/transformers/models/distilbert/tokenization_distilbert_fast.py
@@ -64,12 +64,12 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class DistilBertTokenizerFast(BertTokenizerFast):
     r"""
-    Construct a "fast" DistilBERT tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" DistilBERT tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.DistilBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+    [`DistilBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/dpr/configuration_dpr.py b/src/transformers/models/dpr/configuration_dpr.py
index a9b5f96556..dd0a9dfddc 100644
--- a/src/transformers/models/dpr/configuration_dpr.py
+++ b/src/transformers/models/dpr/configuration_dpr.py
@@ -32,51 +32,49 @@ DPR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class DPRConfig(PretrainedConfig):
     r"""
-    :class:`~transformers.DPRConfig` is the configuration class to store the configuration of a `DPRModel`.
+    [`DPRConfig`] is the configuration class to store the configuration of a *DPRModel*.
 
-    This is the configuration class to store the configuration of a :class:`~transformers.DPRContextEncoder`,
-    :class:`~transformers.DPRQuestionEncoder`, or a :class:`~transformers.DPRReader`. It is used to instantiate the
+    This is the configuration class to store the configuration of a [`DPRContextEncoder`],
+    [`DPRQuestionEncoder`], or a [`DPRReader`]. It is used to instantiate the
     components of the DPR model.
 
-    This class is a subclass of :class:`~transformers.BertConfig`. Please check the superclass for the documentation of
+    This class is a subclass of [`BertConfig`]. Please check the superclass for the documentation of
     all kwargs.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
-            Vocabulary size of the DPR model. Defines the different tokens that can be represented by the `inputs_ids`
-            passed to the forward method of :class:`~transformers.BertModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the DPR model. Defines the different tokens that can be represented by the *input_ids*
+            passed to the forward method of [`BertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the *token_type_ids* passed into [`BertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
-            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
-            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
-            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
-            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
-            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
-            <https://arxiv.org/abs/2009.13658>`__.
-        projection_dim (:obj:`int`, `optional`, defaults to 0):
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
+            `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
+            `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
+            *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        projection_dim (`int`, *optional*, defaults to 0):
             Dimension of the projection for the context and question encoders. If it is set to zero (default), then no
             projection is done.
     """
diff --git a/src/transformers/models/dpr/modeling_dpr.py b/src/transformers/models/dpr/modeling_dpr.py
index c845c31aa9..333106e4fb 100644
--- a/src/transformers/models/dpr/modeling_dpr.py
+++ b/src/transformers/models/dpr/modeling_dpr.py
@@ -64,7 +64,7 @@ class DPRContextEncoderOutput(ModelOutput):
     Class for outputs of [`DPRQuestionEncoder`].
 
     Args:
-        pooler_output: (:obj:`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
+        pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
             The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
             hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
             This output is to be used to embed contexts for nearest neighbors queries with questions embeddings.
@@ -91,7 +91,7 @@ class DPRQuestionEncoderOutput(ModelOutput):
     Class for outputs of [`DPRQuestionEncoder`].
 
     Args:
-        pooler_output: (:obj:`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
+        pooler_output (`torch.FloatTensor` of shape `(batch_size, embeddings_size)`):
             The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
             hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
             This output is to be used to embed questions for nearest neighbors queries with context embeddings.
@@ -118,11 +118,11 @@ class DPRReaderOutput(ModelOutput):
     Class for outputs of [`DPRQuestionEncoder`].
 
     Args:
-        start_logits: (:obj:`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
+        start_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
             Logits of the start index of the span for each passage.
-        end_logits: (:obj:`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
+        end_logits (`torch.FloatTensor` of shape `(n_passages, sequence_length)`):
             Logits of the end index of the span for each passage.
-        relevance_logits: (``torch.FloatTensor``` of shape `(n_passages, )`):
+        relevance_logits (`torch.FloatTensor` of shape `(n_passages, )`):
             Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the
             question, compared to all the other passages.
         hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
@@ -350,17 +350,17 @@ DPR_ENCODERS_INPUTS_DOCSTRING = r"""
 
             (a) For sequence pairs (for a pair title+text for example):
 
-    ```
-    tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-    token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
-    ```
+            ```
+            tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+            token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
+            ```
 
             (b) For single sequences (for a question for example):
 
-    ```
-    tokens:         [CLS] the dog is hairy . [SEP]
-    token_type_ids:   0   0   0   0  0     0   0
-    ```
+            ```
+            tokens:         [CLS] the dog is hairy . [SEP]
+            token_type_ids:   0   0   0   0  0     0   0
+            ```
 
             DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
             rather than the left.
@@ -463,14 +463,15 @@ class DPRContextEncoder(DPRPretrainedContextEncoder):
         r"""
         Return:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
-            >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
-            >>> model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
-            >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
-            >>> embeddings = model(input_ids).pooler_output
-        """
+        ```python
+        >>> from transformers import DPRContextEncoder, DPRContextEncoderTokenizer
+        >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
+        >>> model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
+        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
+        >>> embeddings = model(input_ids).pooler_output
+        ```"""
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -542,13 +543,15 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder):
         r"""
         Return:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
-            >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
-            >>> model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
-            >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
-            >>> embeddings = model(input_ids).pooler_output
+        ```python
+        >>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
+        >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
+        >>> model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
+        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"]
+        >>> embeddings = model(input_ids).pooler_output
+        ```
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -619,22 +622,23 @@ class DPRReader(DPRPretrainedReader):
         r"""
         Return:
 
-        Examples::
-
-            >>> from transformers import DPRReader, DPRReaderTokenizer
-            >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
-            >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
-            >>> encoded_inputs = tokenizer(
-            ...         questions=["What is love ?"],
-            ...         titles=["Haddaway"],
-            ...         texts=["'What Is Love' is a song recorded by the artist Haddaway"],
-            ...         return_tensors='pt'
-            ...     )
-            >>> outputs = model(**encoded_inputs)
-            >>> start_logits = outputs.start_logits
-            >>> end_logits = outputs.end_logits
-            >>> relevance_logits = outputs.relevance_logits
+        Examples:
 
+        ```python
+        >>> from transformers import DPRReader, DPRReaderTokenizer
+        >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
+        >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
+        >>> encoded_inputs = tokenizer(
+        ...         questions=["What is love ?"],
+        ...         titles=["Haddaway"],
+        ...         texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+        ...         return_tensors='pt'
+        ...     )
+        >>> outputs = model(**encoded_inputs)
+        >>> start_logits = outputs.start_logits
+        >>> end_logits = outputs.end_logits
+        >>> relevance_logits = outputs.relevance_logits
+        ```
         """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
diff --git a/src/transformers/models/dpr/modeling_tf_dpr.py b/src/transformers/models/dpr/modeling_tf_dpr.py
index 4ec0e7b5fb..75e925cb2a 100644
--- a/src/transformers/models/dpr/modeling_tf_dpr.py
+++ b/src/transformers/models/dpr/modeling_tf_dpr.py
@@ -61,7 +61,7 @@ class TFDPRContextEncoderOutput(ModelOutput):
     Class for outputs of [`TFDPRContextEncoder`].
 
     Args:
-        pooler_output: (:obj:`tf.Tensor` of shape `(batch_size, embeddings_size)`):
+        pooler_output (`tf.Tensor` of shape `(batch_size, embeddings_size)`):
             The DPR encoder outputs the *pooler_output* that corresponds to the context representation. Last layer
             hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
             This output is to be used to embed contexts for nearest neighbors queries with questions embeddings.
@@ -88,7 +88,7 @@ class TFDPRQuestionEncoderOutput(ModelOutput):
     Class for outputs of [`TFDPRQuestionEncoder`].
 
     Args:
-        pooler_output: (:obj:`tf.Tensor` of shape `(batch_size, embeddings_size)`):
+        pooler_output (`tf.Tensor` of shape `(batch_size, embeddings_size)`):
             The DPR encoder outputs the *pooler_output* that corresponds to the question representation. Last layer
             hidden-state of the first token of the sequence (classification token) further processed by a Linear layer.
             This output is to be used to embed questions for nearest neighbors queries with context embeddings.
@@ -115,11 +115,11 @@ class TFDPRReaderOutput(ModelOutput):
     Class for outputs of [`TFDPRReaderEncoder`].
 
     Args:
-        start_logits: (:obj:`tf.Tensor` of shape `(n_passages, sequence_length)`):
+        start_logits (`tf.Tensor` of shape `(n_passages, sequence_length)`):
             Logits of the start index of the span for each passage.
-        end_logits: (:obj:`tf.Tensor` of shape `(n_passages, sequence_length)`):
+        end_logits (`tf.Tensor` of shape `(n_passages, sequence_length)`):
             Logits of the end index of the span for each passage.
-        relevance_logits: (``tf.Tensor``` of shape `(n_passages, )`):
+        relevance_logits (`tf.Tensor` of shape `(n_passages, )`):
             Outputs of the QA classifier of the DPRReader that corresponds to the scores of each passage to answer the
             question, compared to all the other passages.
         hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
@@ -485,17 +485,17 @@ TF_DPR_ENCODERS_INPUTS_DOCSTRING = r"""
 
             (a) For sequence pairs (for a pair title+text for example):
 
-    ```
-    tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-    token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
-    ```
+            ```
+            tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+            token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
+            ```
 
             (b) For single sequences (for a question for example):
 
-    ```
-    tokens:         [CLS] the dog is hairy . [SEP]
-    token_type_ids:   0   0   0   0  0     0   0
-    ```
+            ```
+            tokens:         [CLS] the dog is hairy . [SEP]
+            token_type_ids:   0   0   0   0  0     0   0
+            ```
 
             DPR is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
             rather than the left.
@@ -610,13 +610,15 @@ class TFDPRContextEncoder(TFDPRPretrainedContextEncoder):
         r"""
         Return:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import TFDPRContextEncoder, DPRContextEncoderTokenizer
-            >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
-            >>> model = TFDPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', from_pt=True)
-            >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='tf')["input_ids"]
-            >>> embeddings = model(input_ids).pooler_output
+        ```python
+        >>> from transformers import TFDPRContextEncoder, DPRContextEncoderTokenizer
+        >>> tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base')
+        >>> model = TFDPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', from_pt=True)
+        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='tf')["input_ids"]
+        >>> embeddings = model(input_ids).pooler_output
+        ```
         """
         inputs = input_processing(
             func=self.call,
@@ -708,13 +710,15 @@ class TFDPRQuestionEncoder(TFDPRPretrainedQuestionEncoder):
         r"""
         Return:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import TFDPRQuestionEncoder, DPRQuestionEncoderTokenizer
-            >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
-            >>> model = TFDPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', from_pt=True)
-            >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='tf')["input_ids"]
-            >>> embeddings = model(input_ids).pooler_output
+        ```python
+        >>> from transformers import TFDPRQuestionEncoder, DPRQuestionEncoderTokenizer
+        >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base')
+        >>> model = TFDPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base', from_pt=True)
+        >>> input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='tf')["input_ids"]
+        >>> embeddings = model(input_ids).pooler_output
+        ```
         """
         inputs = input_processing(
             func=self.call,
@@ -804,22 +808,23 @@ class TFDPRReader(TFDPRPretrainedReader):
         r"""
         Return:
 
-        Examples::
-
-            >>> from transformers import TFDPRReader, DPRReaderTokenizer
-            >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
-            >>> model = TFDPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', from_pt=True)
-            >>> encoded_inputs = tokenizer(
-            ...         questions=["What is love ?"],
-            ...         titles=["Haddaway"],
-            ...         texts=["'What Is Love' is a song recorded by the artist Haddaway"],
-            ...         return_tensors='tf'
-            ...     )
-            >>> outputs = model(encoded_inputs)
-            >>> start_logits = outputs.start_logits
-            >>> end_logits = outputs.end_logits
-            >>> relevance_logits = outputs.relevance_logits
+        Examples:
 
+        ```python
+        >>> from transformers import TFDPRReader, DPRReaderTokenizer
+        >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
+        >>> model = TFDPRReader.from_pretrained('facebook/dpr-reader-single-nq-base', from_pt=True)
+        >>> encoded_inputs = tokenizer(
+        ...         questions=["What is love ?"],
+        ...         titles=["Haddaway"],
+        ...         texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+        ...         return_tensors='tf'
+        ...     )
+        >>> outputs = model(encoded_inputs)
+        >>> start_logits = outputs.start_logits
+        >>> end_logits = outputs.end_logits
+        >>> relevance_logits = outputs.relevance_logits
+        ```
         """
         inputs = input_processing(
             func=self.call,
diff --git a/src/transformers/models/dpr/tokenization_dpr.py b/src/transformers/models/dpr/tokenization_dpr.py
index 23bfff9062..46ab974262 100644
--- a/src/transformers/models/dpr/tokenization_dpr.py
+++ b/src/transformers/models/dpr/tokenization_dpr.py
@@ -91,10 +91,10 @@ class DPRContextEncoderTokenizer(BertTokenizer):
     r"""
     Construct a DPRContextEncoder tokenizer.
 
-    :class:`~transformers.DPRContextEncoderTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs
+    [`DPRContextEncoderTokenizer`] is identical to [`BertTokenizer`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
@@ -108,10 +108,10 @@ class DPRQuestionEncoderTokenizer(BertTokenizer):
     r"""
     Constructs a DPRQuestionEncoder tokenizer.
 
-    :class:`~transformers.DPRQuestionEncoderTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs
+    [`DPRQuestionEncoderTokenizer`] is identical to [`BertTokenizer`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
@@ -130,70 +130,70 @@ DPRReaderOutput = collections.namedtuple("DPRReaderOutput", ["start_logits", "en
 
 CUSTOM_DPR_READER_DOCSTRING = r"""
     Return a dictionary with the token ids of the input strings and other information to give to
-    :obj:`.decode_best_spans`. It converts the strings of a question and different passages (title and text) in a
-    sequence of IDs (integers), using the tokenizer and vocabulary. The resulting :obj:`input_ids` is a matrix of size
-    :obj:`(n_passages, sequence_length)` with the format:
+    `.decode_best_spans`. It converts the strings of a question and different passages (title and text) in a
+    sequence of IDs (integers), using the tokenizer and vocabulary. The resulting `input_ids` is a matrix of size
+    `(n_passages, sequence_length)` with the format:
 
-    ::
-
-        [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>
+    ```
+    [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>
+    ```
 
     Args:
-        questions (:obj:`str` or :obj:`List[str]`):
+        questions (`str` or `List[str]`):
             The questions to be encoded. You can specify one question for many passages. In this case, the question
-            will be duplicated like :obj:`[questions] * n_passages`. Otherwise you have to specify as many questions as
-            in :obj:`titles` or :obj:`texts`.
-        titles (:obj:`str` or :obj:`List[str]`):
+            will be duplicated like `[questions] * n_passages`. Otherwise you have to specify as many questions as
+            in `titles` or `texts`.
+        titles (`str` or `List[str]`):
             The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
-        texts (:obj:`str` or :obj:`List[str]`):
+        texts (`str` or `List[str]`):
             The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
-        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
+        padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
             Activates and controls padding. Accepts the following values:
 
-            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
               sequence if provided).
-            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
               maximum acceptable input length for the model if that argument is not provided.
-            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+            - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
               different lengths).
-        truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+        truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
             Activates and controls truncation. Accepts the following values:
 
-            * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
-              :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
+            - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument
+              `max_length` or to the maximum acceptable input length for the model if that argument is not
               provided. This will truncate token by token, removing a token from the longest sequence in the pair if a
               pair of sequences (or a batch of pairs) is provided.
-            * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to the
+            - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
               maximum acceptable input length for the model if that argument is not provided. This will only truncate
               the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-            * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+            - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to
               the maximum acceptable input length for the model if that argument is not provided. This will only
               truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-            * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence
+            - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence
               lengths greater than the model maximum admissible input size).
-        max_length (:obj:`int`, `optional`):
+        max_length (`int`, *optional*):
                 Controls the maximum length to use by one of the truncation/padding parameters.
 
-                If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum
                 length is required by one of the truncation/padding parameters. If the model has no specific maximum
                 input length (like XLNet) truncation/padding to a maximum length will be deactivated.
-        return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+        return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
-        return_attention_mask (:obj:`bool`, `optional`):
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+        return_attention_mask (`bool`, *optional*):
             Whether or not to return the attention mask. If not set, will return the attention mask according to the
-            specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+            specific tokenizer's default, defined by the `return_outputs` attribute.
 
-            `What are attention masks? <../glossary.html#attention-mask>`__
+            [What are attention masks?](../glossary#attention-mask)
 
     Returns:
-        :obj:`Dict[str, List[List[int]]]`: A dictionary with the following keys:
+        `Dict[str, List[List[int]]]`: A dictionary with the following keys:
 
-        - ``input_ids``: List of token ids to be fed to a model.
-        - ``attention_mask``: List of indices specifying which tokens should be attended to by the model.
+        - `input_ids`: List of token ids to be fed to a model.
+        - `attention_mask`: List of indices specifying which tokens should be attended to by the model.
     """
 
 
@@ -268,33 +268,31 @@ class CustomDPRReaderTokenizerMixin:
         """
         Get the span predictions for the extractive Q&A model.
 
-        Returns: `List` of `DPRReaderOutput` sorted by descending `(relevance_score, span_score)`. Each
-        `DPRReaderOutput` is a `Tuple` with:
+        Returns: *List* of *DPRReaderOutput* sorted by descending *(relevance_score, span_score)*. Each
+        *DPRReaderOutput* is a *Tuple* with:
 
-            - **span_score**: ``float`` that corresponds to the score given by the reader for this span compared to
+            - **span_score**: `float` that corresponds to the score given by the reader for this span compared to
               other spans in the same passage. It corresponds to the sum of the start and end logits of the span.
-            - **relevance_score**: ``float`` that corresponds to the score of the each passage to answer the question,
+            - **relevance_score**: `float` that corresponds to the score of each passage to answer the question,
               compared to all the other passages. It corresponds to the output of the QA classifier of the DPRReader.
-            - **doc_id**: ``int``` the id of the passage.
-            - **start_index**: ``int`` the start index of the span (inclusive).
-            - **end_index**: ``int`` the end index of the span (inclusive).
+            - **doc_id**: `int` the id of the passage. - **start_index**: `int` the start index of the span (inclusive). - **end_index**: `int` the end index of the span (inclusive).
 
-        Examples::
+        Examples:
 
-            >>> from transformers import DPRReader, DPRReaderTokenizer
-            >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
-            >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
-            >>> encoded_inputs = tokenizer(
-            ...         questions=["What is love ?"],
-            ...         titles=["Haddaway"],
-            ...         texts=["'What Is Love' is a song recorded by the artist Haddaway"],
-            ...         return_tensors='pt'
-            ...     )
-            >>> outputs = model(**encoded_inputs)
-            >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
-            >>> print(predicted_spans[0].text)  # best span
-
-        """
+        ```python
+        >>> from transformers import DPRReader, DPRReaderTokenizer
+        >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
+        >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
+        >>> encoded_inputs = tokenizer(
+        ...         questions=["What is love ?"],
+        ...         titles=["Haddaway"],
+        ...         texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+        ...         return_tensors='pt'
+        ...     )
+        >>> outputs = model(**encoded_inputs)
+        >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
+        >>> print(predicted_spans[0].text)  # best span
+        ```"""
         input_ids = reader_input["input_ids"]
         start_logits, end_logits, relevance_logits = reader_output[:3]
         n_passages = len(relevance_logits)
@@ -373,11 +371,11 @@ class DPRReaderTokenizer(CustomDPRReaderTokenizerMixin, BertTokenizer):
     r"""
     Construct a DPRReader tokenizer.
 
-    :class:`~transformers.DPRReaderTokenizer` is almost identical to :class:`~transformers.BertTokenizer` and runs
+    [`DPRReaderTokenizer`] is almost identical to [`BertTokenizer`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece. The difference is that is has three inputs strings:
-    question, titles and texts that are combined to be fed to the :class:`~transformers.DPRReader` model.
+    question, titles and texts that are combined to be fed to the [`DPRReader`] model.
 
-    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/dpr/tokenization_dpr_fast.py b/src/transformers/models/dpr/tokenization_dpr_fast.py
index 1f5a37be24..d4d5f41418 100644
--- a/src/transformers/models/dpr/tokenization_dpr_fast.py
+++ b/src/transformers/models/dpr/tokenization_dpr_fast.py
@@ -90,12 +90,12 @@ READER_PRETRAINED_INIT_CONFIGURATION = {
 
 class DPRContextEncoderTokenizerFast(BertTokenizerFast):
     r"""
-    Construct a "fast" DPRContextEncoder tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" DPRContextEncoder tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.DPRContextEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and
+    [`DPRContextEncoderTokenizerFast`] is identical to [`BertTokenizerFast`] and
     runs end-to-end tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
 
@@ -108,12 +108,12 @@ class DPRContextEncoderTokenizerFast(BertTokenizerFast):
 
 class DPRQuestionEncoderTokenizerFast(BertTokenizerFast):
     r"""
-    Constructs a "fast" DPRQuestionEncoder tokenizer (backed by HuggingFace's `tokenizers` library).
+    Constructs a "fast" DPRQuestionEncoder tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.DPRQuestionEncoderTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and
+    [`DPRQuestionEncoderTokenizerFast`] is identical to [`BertTokenizerFast`] and
     runs end-to-end tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
 
@@ -133,68 +133,68 @@ DPRReaderOutput = collections.namedtuple("DPRReaderOutput", ["start_logits", "en
 
 CUSTOM_DPR_READER_DOCSTRING = r"""
     Return a dictionary with the token ids of the input strings and other information to give to
-    :obj:`.decode_best_spans`. It converts the strings of a question and different passages (title and text) in a
-    sequence of IDs (integers), using the tokenizer and vocabulary. The resulting :obj:`input_ids` is a matrix of size
-    :obj:`(n_passages, sequence_length)` with the format:
+    `.decode_best_spans`. It converts the strings of a question and different passages (title and text) in a
+    sequence of IDs (integers), using the tokenizer and vocabulary. The resulting `input_ids` is a matrix of size
+    `(n_passages, sequence_length)` with the format:
 
     [CLS] <question token ids> [SEP] <titles ids> [SEP] <texts ids>
 
     Args:
-        questions (:obj:`str` or :obj:`List[str]`):
+        questions (`str` or `List[str]`):
             The questions to be encoded. You can specify one question for many passages. In this case, the question
-            will be duplicated like :obj:`[questions] * n_passages`. Otherwise you have to specify as many questions as
-            in :obj:`titles` or :obj:`texts`.
-        titles (:obj:`str` or :obj:`List[str]`):
+            will be duplicated like `[questions] * n_passages`. Otherwise you have to specify as many questions as
+            in `titles` or `texts`.
+        titles (`str` or `List[str]`):
             The passages titles to be encoded. This can be a string or a list of strings if there are several passages.
-        texts (:obj:`str` or :obj:`List[str]`):
+        texts (`str` or `List[str]`):
             The passages texts to be encoded. This can be a string or a list of strings if there are several passages.
-        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
+        padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
             Activates and controls padding. Accepts the following values:
 
-            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+            - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
               sequence if provided).
-            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
               maximum acceptable input length for the model if that argument is not provided.
-            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+            - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
               different lengths).
-        truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+        truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
             Activates and controls truncation. Accepts the following values:
 
-            * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
-              :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
+            - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument
+              `max_length` or to the maximum acceptable input length for the model if that argument is not
               provided. This will truncate token by token, removing a token from the longest sequence in the pair if a
               pair of sequences (or a batch of pairs) is provided.
-            * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to the
+            - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
               maximum acceptable input length for the model if that argument is not provided. This will only truncate
               the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-            * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+            - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to
               the maximum acceptable input length for the model if that argument is not provided. This will only
               truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-            * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence
+            - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence
               lengths greater than the model maximum admissible input size).
-        max_length (:obj:`int`, `optional`):
+        max_length (`int`, *optional*):
                 Controls the maximum length to use by one of the truncation/padding parameters.
 
-                If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum
                 length is required by one of the truncation/padding parameters. If the model has no specific maximum
                 input length (like XLNet) truncation/padding to a maximum length will be deactivated.
-        return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+        return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
-        return_attention_mask (:obj:`bool`, `optional`):
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+        return_attention_mask (`bool`, *optional*):
             Whether or not to return the attention mask. If not set, will return the attention mask according to the
-            specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+            specific tokenizer's default, defined by the `return_outputs` attribute.
 
-            `What are attention masks? <../glossary.html#attention-mask>`__
+            [What are attention masks?](../glossary#attention-mask)
 
     Return:
-        :obj:`Dict[str, List[List[int]]]`: A dictionary with the following keys:
+        `Dict[str, List[List[int]]]`: A dictionary with the following keys:
 
-        - ``input_ids``: List of token ids to be fed to a model.
-        - ``attention_mask``: List of indices specifying which tokens should be attended to by the model.
+        - `input_ids`: List of token ids to be fed to a model.
+        - `attention_mask`: List of indices specifying which tokens should be attended to by the model.
     """
 
 
@@ -269,33 +269,31 @@ class CustomDPRReaderTokenizerMixin:
         """
         Get the span predictions for the extractive Q&A model.
 
-        Returns: `List` of `DPRReaderOutput` sorted by descending `(relevance_score, span_score)`. Each
-        `DPRReaderOutput` is a `Tuple` with:
+        Returns: *List* of *DPRReaderOutput* sorted by descending *(relevance_score, span_score)*. Each
+        *DPRReaderOutput* is a *Tuple* with:
 
-            - **span_score**: ``float`` that corresponds to the score given by the reader for this span compared to
+            - **span_score**: `float` that corresponds to the score given by the reader for this span compared to
               other spans in the same passage. It corresponds to the sum of the start and end logits of the span.
-            - **relevance_score**: ``float`` that corresponds to the score of the each passage to answer the question,
+            - **relevance_score**: `float` that corresponds to the score of each passage to answer the question,
               compared to all the other passages. It corresponds to the output of the QA classifier of the DPRReader.
-            - **doc_id**: ``int``` the id of the passage.
-            - ***start_index**: ``int`` the start index of the span (inclusive).
-            - **end_index**: ``int`` the end index of the span (inclusive).
+            - **doc_id**: `int` the id of the passage. - **start_index**: `int` the start index of the span (inclusive). - **end_index**: `int` the end index of the span (inclusive).
 
-        Examples::
+        Examples:
 
-            >>> from transformers import DPRReader, DPRReaderTokenizer
-            >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
-            >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
-            >>> encoded_inputs = tokenizer(
-            ...         questions=["What is love ?"],
-            ...         titles=["Haddaway"],
-            ...         texts=["'What Is Love' is a song recorded by the artist Haddaway"],
-            ...         return_tensors='pt'
-            ...     )
-            >>> outputs = model(**encoded_inputs)
-            >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
-            >>> print(predicted_spans[0].text)  # best span
-
-        """
+        ```python
+        >>> from transformers import DPRReader, DPRReaderTokenizer
+        >>> tokenizer = DPRReaderTokenizer.from_pretrained('facebook/dpr-reader-single-nq-base')
+        >>> model = DPRReader.from_pretrained('facebook/dpr-reader-single-nq-base')
+        >>> encoded_inputs = tokenizer(
+        ...         questions=["What is love ?"],
+        ...         titles=["Haddaway"],
+        ...         texts=["'What Is Love' is a song recorded by the artist Haddaway"],
+        ...         return_tensors='pt'
+        ...     )
+        >>> outputs = model(**encoded_inputs)
+        >>> predicted_spans = tokenizer.decode_best_spans(encoded_inputs, outputs)
+        >>> print(predicted_spans[0].text)  # best span
+        ```"""
         input_ids = reader_input["input_ids"]
         start_logits, end_logits, relevance_logits = reader_output[:3]
         n_passages = len(relevance_logits)
@@ -372,13 +370,13 @@ class CustomDPRReaderTokenizerMixin:
 @add_end_docstrings(CUSTOM_DPR_READER_DOCSTRING)
 class DPRReaderTokenizerFast(CustomDPRReaderTokenizerMixin, BertTokenizerFast):
     r"""
-    Constructs a "fast" DPRReader tokenizer (backed by HuggingFace's `tokenizers` library).
+    Constructs a "fast" DPRReader tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.DPRReaderTokenizerFast` is almost identical to :class:`~transformers.BertTokenizerFast` and
+    [`DPRReaderTokenizerFast`] is almost identical to [`BertTokenizerFast`] and
     runs end-to-end tokenization: punctuation splitting and wordpiece. The difference is that is has three inputs
-    strings: question, titles and texts that are combined to be fed to the :class:`~transformers.DPRReader` model.
+    strings: question, titles and texts that are combined to be fed to the [`DPRReader`] model.
 
-    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
     parameters.
 
     """
diff --git a/src/transformers/models/electra/configuration_electra.py b/src/transformers/models/electra/configuration_electra.py
index b0fb6ea73c..963460318e 100644
--- a/src/transformers/models/electra/configuration_electra.py
+++ b/src/transformers/models/electra/configuration_electra.py
@@ -33,96 +33,94 @@ ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class ElectraConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.ElectraModel` or a
-    :class:`~transformers.TFElectraModel`. It is used to instantiate a ELECTRA model according to the specified
+    This is the configuration class to store the configuration of an [`ElectraModel`] or a
+    [`TFElectraModel`]. It is used to instantiate an ELECTRA model according to the specified
     arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
-    configuration to that of the ELECTRA `google/electra-small-discriminator
-    <https://huggingface.co/google/electra-small-discriminator>`__ architecture.
+    configuration to that of the ELECTRA [google/electra-small-discriminator](https://huggingface.co/google/electra-small-discriminator) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the ELECTRA model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.ElectraModel` or
-            :class:`~transformers.TFElectraModel`.
-        embedding_size (:obj:`int`, `optional`, defaults to 128):
+            `input_ids` passed when calling [`ElectraModel`] or
+            [`TFElectraModel`].
+        embedding_size (`int`, *optional*, defaults to 128):
             Dimensionality of the encoder layers and the pooler layer.
-        hidden_size (:obj:`int`, `optional`, defaults to 256):
+        hidden_size (`int`, *optional*, defaults to 256):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 4):
+        num_attention_heads (`int`, *optional*, defaults to 4):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 1024):
+        intermediate_size (`int`, *optional*, defaults to 1024):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.ElectraModel` or
-            :class:`~transformers.TFElectraModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`ElectraModel`] or
+            [`TFElectraModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        summary_type (:obj:`str`, `optional`, defaults to :obj:`"first"`):
+        summary_type (`str`, *optional*, defaults to `"first"`):
             Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
 
             Has to be one of the following options:
 
-                - :obj:`"last"`: Take the last token hidden state (like XLNet).
-                - :obj:`"first"`: Take the first token hidden state (like BERT).
-                - :obj:`"mean"`: Take the mean of all tokens hidden states.
-                - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
-                - :obj:`"attn"`: Not implemented now, use multi-head attention.
-        summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                - `"last"`: Take the last token hidden state (like XLNet).
+                - `"first"`: Take the first token hidden state (like BERT).
+                - `"mean"`: Take the mean of all tokens hidden states.
+                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+                - `"attn"`: Not implemented now, use multi-head attention.
+        summary_use_proj (`bool`, *optional*, defaults to `True`):
             Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
 
             Whether or not to add a projection after the vector extraction.
-        summary_activation (:obj:`str`, `optional`):
+        summary_activation (`str`, *optional*):
             Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
 
-            Pass :obj:`"gelu"` for a gelu activation to the output, any other value will result in no activation.
-        summary_last_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            Pass `"gelu"` for a gelu activation to the output, any other value will result in no activation.
+        summary_last_dropout (`float`, *optional*, defaults to 0.0):
             Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
 
             The dropout ratio to be used after the projection and activation.
-        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
-            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
-            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
-            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
-            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
-            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
-            <https://arxiv.org/abs/2009.13658>`__.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
+            `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
+            `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
+            *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if ``config.is_decoder=True``.
-        classifier_dropout (:obj:`float`, `optional`):
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
             The dropout ratio for the classification head.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import ElectraModel, ElectraConfig
+    ```python
+    >>> from transformers import ElectraModel, ElectraConfig
 
-        >>> # Initializing a ELECTRA electra-base-uncased style configuration
-        >>> configuration = ElectraConfig()
+    >>> # Initializing an ELECTRA electra-base-uncased style configuration
+    >>> configuration = ElectraConfig()
 
-        >>> # Initializing a model from the electra-base-uncased style configuration
-        >>> model = ElectraModel(configuration)
+    >>> # Initializing a model from the electra-base-uncased style configuration
+    >>> model = ElectraModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "electra"
 
     def __init__(
diff --git a/src/transformers/models/electra/modeling_flax_electra.py b/src/transformers/models/electra/modeling_flax_electra.py
index 020a18eba6..afa028fb45 100644
--- a/src/transformers/models/electra/modeling_flax_electra.py
+++ b/src/transformers/models/electra/modeling_flax_electra.py
@@ -814,17 +814,19 @@ class FlaxElectraForPreTraining(FlaxElectraPreTrainedModel):
 FLAX_ELECTRA_FOR_PRETRAINING_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> from transformers import ElectraTokenizer, FlaxElectraForPreTraining
+    ```python
+    >>> from transformers import ElectraTokenizer, FlaxElectraForPreTraining
 
-        >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-        >>> model = FlaxElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
+    >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
+    >>> model = FlaxElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
 
-        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
-        >>> outputs = model(**inputs)
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
+    >>> outputs = model(**inputs)
 
-        >>> prediction_logits = outputs.logits
+    >>> prediction_logits = outputs.logits
+    ```
 """
 
 overwrite_call_docstring(
diff --git a/src/transformers/models/electra/modeling_tf_electra.py b/src/transformers/models/electra/modeling_tf_electra.py
index 10f8ac6cfc..2dedc146f6 100644
--- a/src/transformers/models/electra/modeling_tf_electra.py
+++ b/src/transformers/models/electra/modeling_tf_electra.py
@@ -1082,17 +1082,18 @@ class TFElectraForPreTraining(TFElectraPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> import tensorflow as tf
-            >>> from transformers import ElectraTokenizer, TFElectraForPreTraining
+        ```python
+        >>> import tensorflow as tf
+        >>> from transformers import ElectraTokenizer, TFElectraForPreTraining
 
-            >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
-            >>> model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
-            >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-            >>> outputs = model(input_ids)
-            >>> scores = outputs[0]
-        """
+        >>> tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-discriminator')
+        >>> model = TFElectraForPreTraining.from_pretrained('google/electra-small-discriminator')
+        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> scores = outputs[0]
+        ```"""
         inputs = input_processing(
             func=self.call,
             config=self.config,
diff --git a/src/transformers/models/electra/tokenization_electra.py b/src/transformers/models/electra/tokenization_electra.py
index 89c6c922e9..8f087263d8 100644
--- a/src/transformers/models/electra/tokenization_electra.py
+++ b/src/transformers/models/electra/tokenization_electra.py
@@ -53,10 +53,10 @@ class ElectraTokenizer(BertTokenizer):
     r"""
     Construct an ELECTRA tokenizer.
 
-    :class:`~transformers.ElectraTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    [`ElectraTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
     tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/electra/tokenization_electra_fast.py b/src/transformers/models/electra/tokenization_electra_fast.py
index 67259d83ea..41c7bd5536 100644
--- a/src/transformers/models/electra/tokenization_electra_fast.py
+++ b/src/transformers/models/electra/tokenization_electra_fast.py
@@ -60,12 +60,12 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class ElectraTokenizerFast(BertTokenizerFast):
     r"""
-    Construct a "fast" ELECTRA tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" ELECTRA tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.ElectraTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+    [`ElectraTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
index b12e32a2c3..4fc7f6b563 100644
--- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py
@@ -25,49 +25,50 @@ logger = logging.get_logger(__name__)
 
 class EncoderDecoderConfig(PretrainedConfig):
     r"""
-    :class:`~transformers.EncoderDecoderConfig` is the configuration class to store the configuration of a
-    :class:`~transformers.EncoderDecoderModel`. It is used to instantiate an Encoder Decoder model according to the
+    [`EncoderDecoderConfig`] is the configuration class to store the configuration of a
+    [`EncoderDecoderModel`]. It is used to instantiate an Encoder Decoder model according to the
     specified arguments, defining the encoder and decoder configs.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        kwargs (`optional`):
+        kwargs (*optional*):
             Dictionary of keyword arguments. Notably:
 
-                - **encoder** (:class:`~transformers.PretrainedConfig`, `optional`) -- An instance of a configuration
+                - **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration
                   object that defines the encoder config.
-                - **decoder** (:class:`~transformers.PretrainedConfig`, `optional`) -- An instance of a configuration
+                - **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration
                   object that defines the decoder config.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
+    ```python
+    >>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
 
-        >>> # Initializing a BERT bert-base-uncased style configuration
-        >>> config_encoder = BertConfig()
-        >>> config_decoder = BertConfig()
+    >>> # Initializing a BERT bert-base-uncased style configuration
+    >>> config_encoder = BertConfig()
+    >>> config_decoder = BertConfig()
 
-        >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+    >>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
 
-        >>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
-        >>> model = EncoderDecoderModel(config=config)
+    >>> # Initializing a Bert2Bert model from the bert-base-uncased style configurations
+    >>> model = EncoderDecoderModel(config=config)
 
-        >>> # Accessing the model configuration
-        >>> config_encoder = model.config.encoder
-        >>> config_decoder  = model.config.decoder
-        >>> # set decoder config to causal lm
-        >>> config_decoder.is_decoder = True
-        >>> config_decoder.add_cross_attention = True
+    >>> # Accessing the model configuration
+    >>> config_encoder = model.config.encoder
+    >>> config_decoder = model.config.decoder
+    >>> # set decoder config to causal lm
+    >>> config_decoder.is_decoder = True
+    >>> config_decoder.add_cross_attention = True
 
-        >>> # Saving the model, including its configuration
-        >>> model.save_pretrained('my-model')
+    >>> # Saving the model, including its configuration
+    >>> model.save_pretrained('my-model')
 
-        >>> # loading model and config from pretrained folder
-        >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
-        >>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
-    """
+    >>> # loading model and config from pretrained folder
+    >>> encoder_decoder_config = EncoderDecoderConfig.from_pretrained('my-model')
+    >>> model = EncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
+    ```"""
     model_type = "encoder-decoder"
     is_composition = True
 
@@ -92,11 +93,11 @@ class EncoderDecoderConfig(PretrainedConfig):
         cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
     ) -> PretrainedConfig:
         r"""
-        Instantiate a :class:`~transformers.EncoderDecoderConfig` (or a derived class) from a pre-trained encoder model
+        Instantiate an [`EncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model
         configuration and decoder model configuration.
 
         Returns:
-            :class:`EncoderDecoderConfig`: An instance of a configuration object
+            [`EncoderDecoderConfig`]: An instance of a configuration object
         """
         logger.info("Set `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
         decoder_config.is_decoder = True
@@ -106,10 +107,10 @@ class EncoderDecoderConfig(PretrainedConfig):
 
     def to_dict(self):
         """
-        Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig`.
+        Serializes this instance to a Python dictionary. Overrides the default `to_dict()` from [`PretrainedConfig`].
 
         Returns:
-            :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
         """
         output = copy.deepcopy(self.__dict__)
         output["encoder"] = self.encoder.to_dict()
diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
index 27d69e9ef9..96205ea334 100644
--- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
@@ -444,32 +444,32 @@ class EncoderDecoderModel(PreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import EncoderDecoderModel, BertTokenizer
-            >>> import torch
+        ```python
+        >>> from transformers import EncoderDecoderModel, BertTokenizer
+        >>> import torch
 
-            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert from pre-trained checkpoints
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert from pre-trained checkpoints
 
-            >>> # training
-            >>> model.config.decoder_start_token_id = tokenizer.cls_token_id
-            >>> model.config.pad_token_id = tokenizer.pad_token_id
-            >>> model.config.vocab_size = model.config.decoder.vocab_size
+        >>> # training
+        >>> model.config.decoder_start_token_id = tokenizer.cls_token_id
+        >>> model.config.pad_token_id = tokenizer.pad_token_id
+        >>> model.config.vocab_size = model.config.decoder.vocab_size
 
-            >>> input_ids = tokenizer("This is a really long text", return_tensors="pt").input_ids
-            >>> labels = tokenizer("This is the corresponding summary", return_tensors="pt").input_ids
-            >>> outputs = model(input_ids=input_ids, labels=input_ids)
-            >>> loss, logits = outputs.loss, outputs.logits
+        >>> input_ids = tokenizer("This is a really long text", return_tensors="pt").input_ids
+        >>> labels = tokenizer("This is the corresponding summary", return_tensors="pt").input_ids
+        >>> outputs = model(input_ids=input_ids, labels=labels)
+        >>> loss, logits = outputs.loss, outputs.logits
 
-            >>> # save and load from pretrained
-            >>> model.save_pretrained("bert2bert")
-            >>> model = EncoderDecoderModel.from_pretrained("bert2bert")
+        >>> # save and load from pretrained
+        >>> model.save_pretrained("bert2bert")
+        >>> model = EncoderDecoderModel.from_pretrained("bert2bert")
 
-            >>> # generation
-            >>> generated = model.generate(input_ids)
-
-        """
+        >>> # generation
+        >>> generated = model.generate(input_ids)
+        ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
diff --git a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
index 3cfb2eb334..186b2ee527 100644
--- a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
@@ -428,20 +428,20 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
+        ```python
+        >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
 
-            >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-            >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
+        >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
 
-            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> input_ids = tokenizer.encode(text, return_tensors='np')
-            >>> encoder_outputs = model.encode(input_ids)
-
-        """
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> input_ids = tokenizer.encode(text, return_tensors='np')
+        >>> encoder_outputs = model.encode(input_ids)
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -505,27 +505,27 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
-            >>> import jax.numpy as jnp
+        ```python
+        >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer
+        >>> import jax.numpy as jnp
 
-            >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-            >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
+        >>> # initialize a bert2gpt2 from pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
 
-            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> input_ids = tokenizer.encode(text, max_length=1024, return_tensors='np')
-            >>> encoder_outputs = model.encode(input_ids)
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> input_ids = tokenizer.encode(text, max_length=1024, return_tensors='np')
+        >>> encoder_outputs = model.encode(input_ids)
 
-            >>> decoder_start_token_id = model.config.decoder.bos_token_id
-            >>> decoder_input_ids = jnp.ones((input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder.bos_token_id
+        >>> decoder_input_ids = jnp.ones((input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> logits = outputs.logits
-
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -631,32 +631,33 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer, GPT2Tokenizer
+        ```python
+        >>> from transformers import FlaxEncoderDecoderModel, BertTokenizer, GPT2Tokenizer
 
-            >>> # load a fine-tuned bert2gpt2 model
-            >>> model = FlaxEncoderDecoderModel.from_pretrained("patrickvonplaten/bert2gpt2-cnn_dailymail-fp16")
-            >>> # load input & output tokenizer
-            >>> tokenizer_input = BertTokenizer.from_pretrained('bert-base-cased')
-            >>> tokenizer_output = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> # load a fine-tuned bert2gpt2 model
+        >>> model = FlaxEncoderDecoderModel.from_pretrained("patrickvonplaten/bert2gpt2-cnn_dailymail-fp16")
+        >>> # load input & output tokenizer
+        >>> tokenizer_input = BertTokenizer.from_pretrained('bert-base-cased')
+        >>> tokenizer_output = GPT2Tokenizer.from_pretrained('gpt2')
 
-            >>> article = '''Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members
-            ... singing a racist chant. SAE's national chapter suspended the students,
-            ... but University of Oklahoma President David Boren took it a step further,
-            ... saying the university's affiliation with the fraternity is permanently done.'''
+        >>> article = '''Sigma Alpha Epsilon is under fire for a video showing party-bound fraternity members
+        ... singing a racist chant. SAE's national chapter suspended the students,
+        ... but University of Oklahoma President David Boren took it a step further,
+        ... saying the university's affiliation with the fraternity is permanently done.'''
 
-            >>> input_ids = tokenizer_input(article, add_special_tokens=True, return_tensors='np').input_ids
+        >>> input_ids = tokenizer_input(article, add_special_tokens=True, return_tensors='np').input_ids
 
-            >>> # use GPT2's eos_token as the pad as well as eos token
-            >>> model.config.eos_token_id = model.config.decoder.eos_token_id
-            >>> model.config.pad_token_id = model.config.eos_token_id
+        >>> # use GPT2's eos_token as the pad as well as eos token
+        >>> model.config.eos_token_id = model.config.decoder.eos_token_id
+        >>> model.config.pad_token_id = model.config.eos_token_id
 
-            >>> sequences = model.generate(input_ids, num_beams=4, max_length=12).sequences
+        >>> sequences = model.generate(input_ids, num_beams=4, max_length=12).sequences
 
-            >>> summary = tokenizer_output.batch_decode(sequences, skip_special_tokens=True)[0]
-            >>> assert summary == "SAS Alpha Epsilon suspended Sigma Alpha Epsilon members"
-        """
+        >>> summary = tokenizer_output.batch_decode(sequences, skip_special_tokens=True)[0]
+        >>> assert summary == "SAS Alpha Epsilon suspended Sigma Alpha Epsilon members"
+        ```"""
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
index 9dc68878f3..4735d94a3f 100644
--- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
@@ -263,26 +263,28 @@ class TFEncoderDecoderModel(TFPreTrainedModel):
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         r"""
-        Initializing `TFEncoderDecoderModel` from a pytorch checkpoint is not supported currently.
+        Initializing *TFEncoderDecoderModel* from a pytorch checkpoint is not supported currently.
 
-        If there are only pytorch checkpoints for a particular encoder-decoder model, a workaround is::
+        If there are only pytorch checkpoints for a particular encoder-decoder model, a workaround is:
 
-            >>> # a workaround to load from pytorch checkpoint
-            >>> _model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
-            >>> _model.encoder.save_pretrained("./encoder")
-            >>> _model.decoder.save_pretrained("./decoder")
-            >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
-            ...     "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True
-            ... )
-            >>> # This is only for copying some specific attributes of this particular model.
-            >>> model.config = _model.config
+        ```python
+        >>> # a workaround to load from pytorch checkpoint
+        >>> _model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
+        >>> _model.encoder.save_pretrained("./encoder")
+        >>> _model.decoder.save_pretrained("./decoder")
+        >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True
+        ... )
+        >>> # This is only for copying some specific attributes of this particular model.
+        >>> model.config = _model.config
+        ```
 
-        Example::
+        Example:
 
-            >>> from transformers import TFEncoderDecoderModel
-            >>> model = TFEncoderDecoderModel.from_pretrained("ydshieh/bert2bert-cnn_dailymail-fp16")
-
-        """
+        ```python
+        >>> from transformers import TFEncoderDecoderModel
+        >>> model = TFEncoderDecoderModel.from_pretrained("ydshieh/bert2bert-cnn_dailymail-fp16")
+        ```"""
 
         from_pt = kwargs.pop("from_pt", False)
         if from_pt:
@@ -481,31 +483,31 @@ class TFEncoderDecoderModel(TFPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import TFEncoderDecoderModel, BertTokenizer
+        ```python
+        >>> from transformers import TFEncoderDecoderModel, BertTokenizer
 
-            >>> # initialize a bert2gpt2 from a pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-            >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
+        >>> # initialize a bert2gpt2 from a pretrained BERT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+        >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-cased', 'gpt2')
 
-            >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
 
-            >>> # forward
-            >>> input_ids = tokenizer.encode("Hello, my dog is cute", add_special_tokens=True, return_tensors='tf')  # Batch size 1
-            >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
+        >>> # forward
+        >>> input_ids = tokenizer.encode("Hello, my dog is cute", add_special_tokens=True, return_tensors='tf')  # Batch size 1
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)
 
-            >>> # training
-            >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
-            >>> loss, logits = outputs.loss, outputs.logits
+        >>> # training
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
+        >>> loss, logits = outputs.loss, outputs.logits
 
-            >>> # save and load from pretrained
-            >>> model.save_pretrained("bert2gpt2")
-            >>> model = TFEncoderDecoderModel.from_pretrained("bert2gpt2")
+        >>> # save and load from pretrained
+        >>> model.save_pretrained("bert2gpt2")
+        >>> model = TFEncoderDecoderModel.from_pretrained("bert2gpt2")
 
-            >>> # generation
-            >>> generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.bos_token_id)
-
-        """
+        >>> # generation
+        >>> generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.bos_token_id)
+        ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
diff --git a/src/transformers/models/flaubert/configuration_flaubert.py b/src/transformers/models/flaubert/configuration_flaubert.py
index a372ff47ce..14509347f4 100644
--- a/src/transformers/models/flaubert/configuration_flaubert.py
+++ b/src/transformers/models/flaubert/configuration_flaubert.py
@@ -30,105 +30,105 @@ FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class FlaubertConfig(XLMConfig):
     """
-    This is the configuration class to store the configuration of a :class:`~transformers.FlaubertModel` or a
-    :class:`~transformers.TFFlaubertModel`. It is used to instantiate a FlauBERT model according to the specified
+    This is the configuration class to store the configuration of a [`FlaubertModel`] or a
+    [`TFFlaubertModel`]. It is used to instantiate a FlauBERT model according to the specified
     arguments, defining the model architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        pre_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        pre_norm (`bool`, *optional*, defaults to `False`):
             Whether to apply the layer normalization before or after the feed forward layer following the attention in
             each layer (Vaswani et al., Tensor2Tensor for Neural Machine Translation. 2018)
-        layerdrop (:obj:`float`, `optional`, defaults to 0.0):
+        layerdrop (`float`, *optional*, defaults to 0.0):
             Probability to drop layers during training (Fan et al., Reducing Transformer Depth on Demand with
             Structured Dropout. ICLR 2020)
-        vocab_size (:obj:`int`, `optional`, defaults to 30145):
+        vocab_size (`int`, *optional*, defaults to 30145):
             Vocabulary size of the FlauBERT model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.FlaubertModel` or
-            :class:`~transformers.TFFlaubertModel`.
-        emb_dim (:obj:`int`, `optional`, defaults to 2048):
+            the `inputs_ids` passed when calling [`FlaubertModel`] or
+            [`TFFlaubertModel`].
+        emb_dim (`int`, *optional*, defaults to 2048):
             Dimensionality of the encoder layers and the pooler layer.
-        n_layer (:obj:`int`, `optional`, defaults to 12):
+        n_layer (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        n_head (:obj:`int`, `optional`, defaults to 16):
+        n_head (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        attention_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for the attention mechanism
-        gelu_activation (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to use a `gelu` activation instead of `relu`.
-        sinusoidal_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        gelu_activation (`bool`, *optional*, defaults to `True`):
+            Whether or not to use a *gelu* activation instead of *relu*.
+        sinusoidal_embeddings (`bool`, *optional*, defaults to `False`):
             Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.
-        causal (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        causal (`bool`, *optional*, defaults to `False`):
             Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in
             order to only attend to the left-side context instead if a bidirectional context.
-        asm (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        asm (`bool`, *optional*, defaults to `False`):
             Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction
             layer.
-        n_langs (:obj:`int`, `optional`, defaults to 1):
+        n_langs (`int`, *optional*, defaults to 1):
             The number of languages the model handles. Set to 1 for monolingual models.
-        use_lang_emb (:obj:`bool`, `optional`, defaults to :obj:`True`)
-            Whether to use language embeddings. Some models use additional language embeddings, see `the multilingual
-            models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__ for
+        use_lang_emb (`bool`, *optional*, defaults to `True`):
+            Whether to use language embeddings. Some models use additional language embeddings, see [the multilingual
+            models page](http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings) for
             information on how to use them.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        embed_init_std (:obj:`float`, `optional`, defaults to 2048^-0.5):
+        embed_init_std (`float`, *optional*, defaults to 2048^-0.5):
             The standard deviation of the truncated_normal_initializer for initializing the embedding matrices.
-        init_std (:obj:`int`, `optional`, defaults to 50257):
+        init_std (`int`, *optional*, defaults to 50257):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the
             embedding matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        bos_index (:obj:`int`, `optional`, defaults to 0):
+        bos_index (`int`, *optional*, defaults to 0):
             The index of the beginning of sentence token in the vocabulary.
-        eos_index (:obj:`int`, `optional`, defaults to 1):
+        eos_index (`int`, *optional*, defaults to 1):
             The index of the end of sentence token in the vocabulary.
-        pad_index (:obj:`int`, `optional`, defaults to 2):
+        pad_index (`int`, *optional*, defaults to 2):
             The index of the padding token in the vocabulary.
-        unk_index (:obj:`int`, `optional`, defaults to 3):
+        unk_index (`int`, *optional*, defaults to 3):
             The index of the unknown token in the vocabulary.
-        mask_index (:obj:`int`, `optional`, defaults to 5):
+        mask_index (`int`, *optional*, defaults to 5):
             The index of the masking token in the vocabulary.
-        is_encoder(:obj:`bool`, `optional`, defaults to :obj:`True`):
+        is_encoder (`bool`, *optional*, defaults to `True`):
             Whether or not the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
-        summary_type (:obj:`string`, `optional`, defaults to "first"):
+        summary_type (`str`, *optional*, defaults to "first"):
             Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
 
             Has to be one of the following options:
 
-                - :obj:`"last"`: Take the last token hidden state (like XLNet).
-                - :obj:`"first"`: Take the first token hidden state (like BERT).
-                - :obj:`"mean"`: Take the mean of all tokens hidden states.
-                - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
-                - :obj:`"attn"`: Not implemented now, use multi-head attention.
-        summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                - `"last"`: Take the last token hidden state (like XLNet).
+                - `"first"`: Take the first token hidden state (like BERT).
+                - `"mean"`: Take the mean of all tokens hidden states.
+                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+                - `"attn"`: Not implemented now, use multi-head attention.
+        summary_use_proj (`bool`, *optional*, defaults to `True`):
             Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
 
             Whether or not to add a projection after the vector extraction.
-        summary_activation (:obj:`str`, `optional`):
+        summary_activation (`str`, *optional*):
             Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
 
-            Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation.
-        summary_proj_to_labels (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
+        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
             Used in the sequence classification and multiple choice models.
 
-            Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes.
-        summary_first_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
+        summary_first_dropout (`float`, *optional*, defaults to 0.1):
             Used in the sequence classification and multiple choice models.
 
             The dropout ratio to be used after the projection and activation.
-        start_n_top (:obj:`int`, `optional`, defaults to 5):
+        start_n_top (`int`, *optional*, defaults to 5):
             Used in the SQuAD evaluation script.
-        end_n_top (:obj:`int`, `optional`, defaults to 5):
+        end_n_top (`int`, *optional*, defaults to 5):
             Used in the SQuAD evaluation script.
-        mask_token_id (:obj:`int`, `optional`, defaults to 0):
+        mask_token_id (`int`, *optional*, defaults to 0):
             Model agnostic parameter to identify masked tokens when generating text in an MLM context.
-        lang_id (:obj:`int`, `optional`, defaults to 1):
+        lang_id (`int`, *optional*, defaults to 1):
             The ID of the language used by the model. This parameter is used when generating text in a given language.
     """
 
diff --git a/src/transformers/models/flaubert/tokenization_flaubert.py b/src/transformers/models/flaubert/tokenization_flaubert.py
index ee6c824612..ecdaaf874a 100644
--- a/src/transformers/models/flaubert/tokenization_flaubert.py
+++ b/src/transformers/models/flaubert/tokenization_flaubert.py
@@ -82,11 +82,11 @@ class FlaubertTokenizer(XLMTokenizer):
 
     - Moses preprocessing and tokenization.
     - Normalizing all inputs text.
-    - The arguments ``special_tokens`` and the function ``set_special_tokens``, can be used to add additional symbols
+    - The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols
       (like "__classify__") to a vocabulary.
-    - The argument :obj:`do_lowercase` controls lower casing (automatically set for pretrained vocabularies).
+    - The argument `do_lowercase` controls lower casing (automatically set for pretrained vocabularies).
 
-    This tokenizer inherits from :class:`~transformers.XLMTokenizer`. Please check the superclass for usage examples
+    This tokenizer inherits from [`XLMTokenizer`]. Please check the superclass for usage examples
     and documentation regarding arguments.
     """
 
diff --git a/src/transformers/models/fnet/configuration_fnet.py b/src/transformers/models/fnet/configuration_fnet.py
index a6922f8355..783064b5d4 100644
--- a/src/transformers/models/fnet/configuration_fnet.py
+++ b/src/transformers/models/fnet/configuration_fnet.py
@@ -29,63 +29,62 @@ FNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class FNetConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.FNetModel`. It is used to
+    This is the configuration class to store the configuration of a [`FNetModel`]. It is used to
     instantiate an FNet model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the FNet `fnet-base
-    <https://huggingface.co/google/fnet-base>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the FNet [fnet-base](https://huggingface.co/google/fnet-base) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 32000):
+        vocab_size (`int`, *optional*, defaults to 32000):
             Vocabulary size of the FNet model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.FNetModel` or
-            :class:`~transformers.TFFNetModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `inputs_ids` passed when calling [`FNetModel`] or
+            [`TFFNetModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimension of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_new"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 4):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.FNetModel` or
-            :class:`~transformers.TFFNetModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 4):
+            The vocabulary size of the `token_type_ids` passed when calling [`FNetModel`] or
+            [`TFFNetModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        use_tpu_fourier_optimizations (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Determines whether to use TPU optimized FFTs. If :obj:`True`, the model will favor axis-wise FFTs
-            transforms. Set to :obj:`False` for GPU/CPU hardware, in which case n-dimensional FFTs are used.
-        tpu_short_seq_length (:obj:`int`, `optional`, defaults to 512):
+        use_tpu_fourier_optimizations (`bool`, *optional*, defaults to `False`):
+            Determines whether to use TPU optimized FFTs. If `True`, the model will favor axis-wise FFTs
+            transforms. Set to `False` for GPU/CPU hardware, in which case n-dimensional FFTs are used.
+        tpu_short_seq_length (`int`, *optional*, defaults to 512):
             The sequence length that is expected by the model when using TPUs. This will be used to initialize the DFT
-            matrix only when `use_tpu_fourier_optimizations` is set to :obj:`True` and the input sequence is shorter
+            matrix only when *use_tpu_fourier_optimizations* is set to `True` and the input sequence is shorter
             than or equal to 4096 tokens.
 
-    Example::
+    Example:
 
+    ```python
+    >>> from transformers import FNetModel, FNetConfig
 
-        >>> from transformers import FNetModel, FNetConfig
+    >>> # Initializing a FNet fnet-base style configuration
+    >>> configuration = FNetConfig()
 
-        >>> # Initializing a FNet fnet-base style configuration
-        >>> configuration = FNetConfig()
+    >>> # Initializing a model from the fnet-base style configuration
+    >>> model = FNetModel(configuration)
 
-        >>> # Initializing a model from the fnet-base style configuration
-        >>> model = FNetModel(configuration)
-
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "fnet"
 
     def __init__(
diff --git a/src/transformers/models/fnet/tokenization_fnet.py b/src/transformers/models/fnet/tokenization_fnet.py
index 7d9f248d86..ad7b1d6396 100644
--- a/src/transformers/models/fnet/tokenization_fnet.py
+++ b/src/transformers/models/fnet/tokenization_fnet.py
@@ -45,53 +45,51 @@ SPIECE_UNDERLINE = "▁"
 
 class FNetTokenizer(PreTrainedTokenizer):
     """
-    Construct an FNet tokenizer. Adapted from :class:`~transformers.AlbertTokenizer`. Based on `SentencePiece
-    <https://github.com/google/sentencepiece>`__. This tokenizer inherits from
-    :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods. Users should refer to this
+    Construct an FNet tokenizer. Adapted from [`AlbertTokenizer`]. Based on [SentencePiece](https://github.com/google/sentencepiece). This tokenizer inherits from
+    [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to this
     superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        do_lower_case (`bool`, *optional*, defaults to `False`):
             Whether or not to lowercase the input when tokenizing.
-        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        remove_space (`bool`, *optional*, defaults to `True`):
             Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
-        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        keep_accents (`bool`, *optional*, defaults to `True`):
             Whether or not to keep accents when tokenizing.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
     Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -221,17 +219,17 @@ class FNetTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An FNet sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -244,18 +242,18 @@ class FNetTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -272,20 +270,22 @@ class FNetTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. An FNet sequence
-        pair mask has the following format: ::
+        pair mask has the following format:
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence |
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence |
+        ```
 
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/fnet/tokenization_fnet_fast.py b/src/transformers/models/fnet/tokenization_fnet_fast.py
index 099fd9c0c5..d6a43f21e8 100644
--- a/src/transformers/models/fnet/tokenization_fnet_fast.py
+++ b/src/transformers/models/fnet/tokenization_fnet_fast.py
@@ -54,35 +54,34 @@ SPIECE_UNDERLINE = "▁"
 
 class FNetTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" FNetTokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
-    :class:`~transformers.AlbertTokenizerFast`. Based on `Unigram
-    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__. This tokenizer
-    inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should
+    Construct a "fast" FNetTokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+    [`AlbertTokenizerFast`]. Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This tokenizer
+    inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
     refer to this superclass for more information regarding those methods
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        do_lower_case (`bool`, *optional*, defaults to `False`):
             Whether or not to lowercase the input when tokenizing.
-        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        remove_space (`bool`, *optional*, defaults to `True`):
             Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
-        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        keep_accents (`bool`, *optional*, defaults to `True`):
             Whether or not to keep accents when tokenizing.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
     """
@@ -142,17 +141,17 @@ class FNetTokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An FNet sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -167,21 +166,21 @@ class FNetTokenizerFast(PreTrainedTokenizerFast):
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An FNet
         sequence pair mask has the following format:
 
-        ::
-
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
         if token_ids_1 is None, only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/fsmt/configuration_fsmt.py b/src/transformers/models/fsmt/configuration_fsmt.py
index 8b60883e69..6e204a8c31 100644
--- a/src/transformers/models/fsmt/configuration_fsmt.py
+++ b/src/transformers/models/fsmt/configuration_fsmt.py
@@ -40,89 +40,89 @@ class DecoderConfig(PretrainedConfig):
 
 class FSMTConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.FSMTModel`. It is used to
+    This is the configuration class to store the configuration of a [`FSMTModel`]. It is used to
     instantiate a FSMT model according to the specified arguments, defining the model architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        langs (:obj:`List[str]`):
+        langs (`List[str]`):
             A list with source language and target_language (e.g., ['en', 'ru']).
-        src_vocab_size (:obj:`int`):
+        src_vocab_size (`int`):
             Vocabulary size of the encoder. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed to the forward method in the encoder.
-        tgt_vocab_size (:obj:`int`):
+            `input_ids` passed to the forward method in the encoder.
+        tgt_vocab_size (`int`):
             Vocabulary size of the decoder. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed to the forward method in the decoder.
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            `input_ids` passed to the forward method in the decoder.
+        d_model (`int`, *optional*, defaults to 1024):
             Dimensionality of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 12):
+        encoder_layers (`int`, *optional*, defaults to 12):
             Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+        decoder_layers (`int`, *optional*, defaults to 12):
             Number of decoder layers.
-        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
+        activation_function (`str` or `Callable`, *optional*, defaults to `"relu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        scale_embedding (`bool`, *optional*, defaults to `True`):
             Scale embeddings by diving by sqrt(d_model).
-        bos_token_id (:obj:`int`, `optional`, defaults to 0)
+        bos_token_id (`int`, *optional*, defaults to 0):
             Beginning of stream token id.
-        pad_token_id (:obj:`int`, `optional`, defaults to 1)
+        pad_token_id (`int`, *optional*, defaults to 1):
             Padding token id.
-        eos_token_id (:obj:`int`, `optional`, defaults to 2)
+        eos_token_id (`int`, *optional*, defaults to 2):
             End of stream token id.
-        decoder_start_token_id (:obj:`int`, `optional`):
-            This model starts decoding with :obj:`eos_token_id`
-        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
+        decoder_start_token_id (`int`, *optional*):
+            This model starts decoding with `eos_token_id`
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
             Google "layerdrop arxiv", as its not explainable in one line.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
             Google "layerdrop arxiv", as its not explainable in one line.
-        is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
             Whether this is an encoder/decoder model.
-        tie_word_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie input and output embeddings.
-        num_beams (:obj:`int`, `optional`, defaults to 5)
-            Number of beams for beam search that will be used by default in the :obj:`generate` method of the model. 1
+        num_beams (`int`, *optional*, defaults to 5):
+            Number of beams for beam search that will be used by default in the `generate` method of the model. 1
             means no beam search.
-        length_penalty (:obj:`float`, `optional`, defaults to 1)
-            Exponential penalty to the length that will be used by default in the :obj:`generate` method of the model.
-        early_stopping (:obj:`bool`, `optional`, defaults to :obj:`False`)
-            Flag that will be used by default in the :obj:`generate` method of the model. Whether to stop the beam
-            search when at least ``num_beams`` sentences are finished per batch or not.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        length_penalty (`float`, *optional*, defaults to 1):
+            Exponential penalty to the length that will be used by default in the `generate` method of the model.
+        early_stopping (`bool`, *optional*, defaults to `False`):
+            Flag that will be used by default in the `generate` method of the model. Whether to stop the beam
+            search when at least `num_beams` sentences are finished per batch or not.
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
-        forced_eos_token_id (:obj:`int`, `optional`, defaults to 2):
-            The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
-            :obj:`eos_token_id`.
+        forced_eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+            `eos_token_id`.
 
-        Examples::
+    Examples:
 
-            >>> from transformers import FSMTConfig, FSMTModel
+    ```python
+    >>> from transformers import FSMTConfig, FSMTModel
 
-            >>> config = FSMTConfig.from_pretrained('facebook/wmt19-en-ru')
-            >>> model = FSMTModel(config)
-
-    """
+    >>> config = FSMTConfig.from_pretrained('facebook/wmt19-en-ru')
+    >>> model = FSMTModel(config)
+    ```"""
     model_type = "fsmt"
     attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
 
@@ -208,10 +208,10 @@ class FSMTConfig(PretrainedConfig):
 
     def to_dict(self):
         """
-        Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig`.
+        Serializes this instance to a Python dictionary. Overrides the default [`~PretrainedConfig.to_dict`].
 
         Returns:
-            :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
         """
         output = copy.deepcopy(self.__dict__)
         output["decoder"] = self.decoder.to_dict()
diff --git a/src/transformers/models/fsmt/tokenization_fsmt.py b/src/transformers/models/fsmt/tokenization_fsmt.py
index ff99d75eeb..73a1ca8322 100644
--- a/src/transformers/models/fsmt/tokenization_fsmt.py
+++ b/src/transformers/models/fsmt/tokenization_fsmt.py
@@ -140,39 +140,42 @@ class FSMTTokenizer(PreTrainedTokenizer):
 
     - Moses preprocessing and tokenization.
     - Normalizing all inputs text.
-    - The arguments ``special_tokens`` and the function ``set_special_tokens``, can be used to add additional symbols
+    - The arguments `special_tokens` and the function `set_special_tokens`, can be used to add additional symbols
       (like "__classify__") to a vocabulary.
-    - The argument :obj:`langs` defines a pair of languages.
+    - The argument `langs` defines a pair of languages.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        langs (:obj:`List[str]`):
-            A list of two languages to translate from and to, for instance :obj:`["en", "ru"]`.
-        src_vocab_file (:obj:`str`):
+        langs (`List[str]`):
+            A list of two languages to translate from and to, for instance `["en", "ru"]`.
+        src_vocab_file (`str`):
             File containing the vocabulary for the source language.
-        tgt_vocab_file (:obj:`st`):
+        tgt_vocab_file (`str`):
             File containing the vocabulary for the target language.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             File containing the merges.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        do_lower_case (`bool`, *optional*, defaults to `False`):
             Whether or not to lowercase the input when tokenizing.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
 
     """
@@ -398,17 +401,17 @@ class FSMTTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A FAIRSEQ Transformer sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
 
@@ -422,18 +425,18 @@ class FSMTTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -452,21 +455,21 @@ class FSMTTokenizer(PreTrainedTokenizer):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ
         Transformer sequence pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
 
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An
diff --git a/src/transformers/models/funnel/configuration_funnel.py b/src/transformers/models/funnel/configuration_funnel.py
index 1d25e765c4..5c2d1c962e 100644
--- a/src/transformers/models/funnel/configuration_funnel.py
+++ b/src/transformers/models/funnel/configuration_funnel.py
@@ -36,69 +36,68 @@ FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class FunnelConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.FunnelModel` or a
-    :class:`~transformers.TFBertModel`. It is used to instantiate a Funnel Transformer model according to the specified
+    This is the configuration class to store the configuration of a [`FunnelModel`] or a
+    [`TFFunnelModel`]. It is used to instantiate a Funnel Transformer model according to the specified
     arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
-    configuration to that of the Funnel Transformer `funnel-transformer/small
-    <https://huggingface.co/funnel-transformer/small>`__ architecture.
+    configuration to that of the Funnel Transformer [funnel-transformer/small](https://huggingface.co/funnel-transformer/small) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the Funnel transformer. Defines the number of different tokens that can be represented
-            by the :obj:`inputs_ids` passed when calling :class:`~transformers.FunnelModel` or
-            :class:`~transformers.TFFunnelModel`.
-        block_sizes (:obj:`List[int]`, `optional`, defaults to :obj:`[4, 4, 4]`):
+            by the `input_ids` passed when calling [`FunnelModel`] or
+            [`TFFunnelModel`].
+        block_sizes (`List[int]`, *optional*, defaults to `[4, 4, 4]`):
             The sizes of the blocks used in the model.
-        block_repeats (:obj:`List[int]`, `optional`):
+        block_repeats (`List[int]`, *optional*):
             If passed along, each layer of each block is repeated the number of times indicated.
-        num_decoder_layers (:obj:`int`, `optional`, defaults to 2):
+        num_decoder_layers (`int`, *optional*, defaults to 2):
             The number of layers in the decoder (when not using the base model).
-        d_model (:obj:`int`, `optional`, defaults to 768):
+        d_model (`int`, *optional*, defaults to 768):
             Dimensionality of the model's hidden states.
-        n_head (:obj:`int`, `optional`, defaults to 12):
+        n_head (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        d_head (:obj:`int`, `optional`, defaults to 64):
+        d_head (`int`, *optional*, defaults to 64):
             Dimensionality of the model's heads.
-        d_inner (:obj:`int`, `optional`, defaults to 3072):
+        d_inner (`int`, *optional*, defaults to 3072):
             Inner dimension in the feed-forward blocks.
-        hidden_act (:obj:`str` or :obj:`callable`, `optional`, defaults to :obj:`"gelu_new"`):
+        hidden_act (`str` or `callable`, *optional*, defaults to `"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        attention_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probability used between the two layers of the feed-forward blocks.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 3):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.FunnelModel` or
-            :class:`~transformers.TFFunnelModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.1):
-            The standard deviation of the `uniform initializer` for initializing all weight matrices in attention
+        type_vocab_size (`int`, *optional*, defaults to 3):
+            The vocabulary size of the `token_type_ids` passed when calling [`FunnelModel`] or
+            [`TFFunnelModel`].
+        initializer_range (`float`, *optional*, defaults to 0.1):
+            The standard deviation of the *uniform initializer* for initializing all weight matrices in attention
             layers.
-        initializer_std (:obj:`float`, `optional`):
-            The standard deviation of the `normal initializer` for initializing the embedding matrix and the weight of
+        initializer_std (`float`, *optional*):
+            The standard deviation of the *normal initializer* for initializing the embedding matrix and the weight of
             linear layers. Will default to 1 for the embedding matrix and the value given by Xavier initialization for
             linear layers.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-9):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-9):
             The epsilon used by the layer normalization layers.
-        pooling_type (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
-            Possible values are ``"mean"`` or ``"max"``. The way pooling is performed at the beginning of each block.
-        attention_type (:obj:`str`, `optional`, defaults to :obj:`"relative_shift"`):
-            Possible values are ``"relative_shift"`` or ``"factorized"``. The former is faster on CPU/GPU while the
+        pooling_type (`str`, *optional*, defaults to `"mean"`):
+            Possible values are `"mean"` or `"max"`. The way pooling is performed at the beginning of each block.
+        attention_type (`str`, *optional*, defaults to `"relative_shift"`):
+            Possible values are `"relative_shift"` or `"factorized"`. The former is faster on CPU/GPU while the
             latter is faster on TPU.
-        separate_cls (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        separate_cls (`bool`, *optional*, defaults to `True`):
             Whether or not to separate the cls token when applying pooling.
-        truncate_seq (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            When using ``separate_cls``, whether or not to truncate the last token when pooling, to avoid getting a
+        truncate_seq (`bool`, *optional*, defaults to `False`):
+            When using `separate_cls`, whether or not to truncate the last token when pooling, to avoid getting a
             sequence length that is not a multiple of 2.
-        pool_q_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        pool_q_only (`bool`, *optional*, defaults to `False`):
             Whether or not to apply the pooling only to the query or to query, key and values for the attention layers.
     """
     model_type = "funnel"
diff --git a/src/transformers/models/funnel/modeling_tf_funnel.py b/src/transformers/models/funnel/modeling_tf_funnel.py
index 04a4208fae..ca77e070b3 100644
--- a/src/transformers/models/funnel/modeling_tf_funnel.py
+++ b/src/transformers/models/funnel/modeling_tf_funnel.py
@@ -1267,17 +1267,18 @@ class TFFunnelForPreTraining(TFFunnelPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import FunnelTokenizer, TFFunnelForPreTraining
-            >>> import torch
+        ```python
+        >>> from transformers import FunnelTokenizer, TFFunnelForPreTraining
+        >>> import tensorflow as tf
 
-            >>> tokenizer = TFFunnelTokenizer.from_pretrained('funnel-transformer/small')
-            >>> model = TFFunnelForPreTraining.from_pretrained('funnel-transformer/small')
+        >>> tokenizer = FunnelTokenizer.from_pretrained('funnel-transformer/small')
+        >>> model = TFFunnelForPreTraining.from_pretrained('funnel-transformer/small')
 
-            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors= "tf")
-            >>> logits = model(inputs).logits
-        """
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors= "tf")
+        >>> logits = model(inputs).logits
+        ```"""
         inputs = input_processing(
             func=self.call,
             config=self.config,
diff --git a/src/transformers/models/funnel/tokenization_funnel.py b/src/transformers/models/funnel/tokenization_funnel.py
index 8a2f00d847..991c048864 100644
--- a/src/transformers/models/funnel/tokenization_funnel.py
+++ b/src/transformers/models/funnel/tokenization_funnel.py
@@ -59,10 +59,10 @@ class FunnelTokenizer(BertTokenizer):
     r"""
     Construct a Funnel Transformer tokenizer.
 
-    :class:`~transformers.FunnelTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    [`FunnelTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
     tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
@@ -113,21 +113,21 @@ class FunnelTokenizer(BertTokenizer):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel
         Transformer sequence pair mask has the following format:
 
-        ::
+        ```
+        2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/funnel/tokenization_funnel_fast.py b/src/transformers/models/funnel/tokenization_funnel_fast.py
index 4ccab80d45..a185f005ed 100644
--- a/src/transformers/models/funnel/tokenization_funnel_fast.py
+++ b/src/transformers/models/funnel/tokenization_funnel_fast.py
@@ -70,12 +70,12 @@ PRETRAINED_INIT_CONFIGURATION = {f"funnel-transformer/{name}": {"do_lower_case":
 
 class FunnelTokenizerFast(BertTokenizerFast):
     r"""
-    Construct a "fast" Funnel Transformer tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" Funnel Transformer tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.FunnelTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+    [`FunnelTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
 
@@ -129,21 +129,21 @@ class FunnelTokenizerFast(BertTokenizerFast):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A Funnel
         Transformer sequence pair mask has the following format:
 
-        ::
+        ```
+        2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            2 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/gpt2/configuration_gpt2.py b/src/transformers/models/gpt2/configuration_gpt2.py
index be4f8df0a8..9ea843a523 100644
--- a/src/transformers/models/gpt2/configuration_gpt2.py
+++ b/src/transformers/models/gpt2/configuration_gpt2.py
@@ -37,97 +37,98 @@ GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class GPT2Config(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a :class:`~transformers.GPT2Model` or a
-    :class:`~transformers.TFGPT2Model`. It is used to instantiate a GPT-2 model according to the specified arguments,
+    This is the configuration class to store the configuration of a [`GPT2Model`] or a
+    [`TFGPT2Model`]. It is used to instantiate a GPT-2 model according to the specified arguments,
     defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
-    to that of the GPT-2 `small <https://huggingface.co/gpt2>`__ architecture.
+    to that of the GPT-2 [small](https://huggingface.co/gpt2) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50257):
+        vocab_size (`int`, *optional*, defaults to 50257):
             Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.GPT2Model` or
-            :class:`~transformers.TFGPT2Model`.
-        n_positions (:obj:`int`, `optional`, defaults to 1024):
+            `input_ids` passed when calling [`GPT2Model`] or
+            [`TFGPT2Model`].
+        n_positions (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        n_embd (:obj:`int`, `optional`, defaults to 768):
+        n_embd (`int`, *optional*, defaults to 768):
             Dimensionality of the embeddings and hidden states.
-        n_layer (:obj:`int`, `optional`, defaults to 12):
+        n_layer (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        n_head (:obj:`int`, `optional`, defaults to 12):
+        n_head (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        n_inner (:obj:`int`, `optional`, defaults to None):
-            Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
-        activation_function (:obj:`str`, `optional`, defaults to :obj:`"gelu"`):
-            Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`.
-        resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+        n_inner (`int`, *optional*, defaults to `None`):
+            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times `n_embd`.
+        activation_function (`str`, *optional*, defaults to `"gelu"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the embeddings.
-        attn_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention.
-        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
             The epsilon to use in the layer normalization layers.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        summary_type (:obj:`string`, `optional`, defaults to :obj:`"cls_index"`):
-            Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
-            and :class:`~transformers.TFGPT2DoubleHeadsModel`.
+        summary_type (`str`, *optional*, defaults to `"cls_index"`):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`]
+            and [`TFGPT2DoubleHeadsModel`].
 
             Has to be one of the following options:
 
-                - :obj:`"last"`: Take the last token hidden state (like XLNet).
-                - :obj:`"first"`: Take the first token hidden state (like BERT).
-                - :obj:`"mean"`: Take the mean of all tokens hidden states.
-                - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
-                - :obj:`"attn"`: Not implemented now, use multi-head attention.
-        summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
-            and :class:`~transformers.TFGPT2DoubleHeadsModel`.
+                - `"last"`: Take the last token hidden state (like XLNet).
+                - `"first"`: Take the first token hidden state (like BERT).
+                - `"mean"`: Take the mean of all tokens hidden states.
+                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+                - `"attn"`: Not implemented now, use multi-head attention.
+        summary_use_proj (`bool`, *optional*, defaults to `True`):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`]
+            and [`TFGPT2DoubleHeadsModel`].
 
             Whether or not to add a projection after the vector extraction.
-        summary_activation (:obj:`str`, `optional`):
+        summary_activation (`str`, *optional*):
             Argument used when doing sequence summary. Used in for the multiple choice head in
-            :class:`~transformers.GPT2DoubleHeadsModel`.
+            [`GPT2DoubleHeadsModel`].
 
-            Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation.
-        summary_proj_to_labels (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
-            and :class:`~transformers.TFGPT2DoubleHeadsModel`.
+            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
+        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`]
+            and [`TFGPT2DoubleHeadsModel`].
 
-            Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes.
-        summary_first_dropout (:obj:`float`, `optional`, defaults to 0.1):
-            Argument used when doing sequence summary, used in the models :class:`~transformers.GPT2DoubleHeadsModel`
-            and :class:`~transformers.TFGPT2DoubleHeadsModel`.
+            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
+        summary_first_dropout (`float`, *optional*, defaults to 0.1):
+            Argument used when doing sequence summary, used in the models [`GPT2DoubleHeadsModel`]
+            and [`TFGPT2DoubleHeadsModel`].
 
             The dropout ratio to be used after the projection and activation.
-        scale_attn_weights (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
             Scale attention weights by dividing by sqrt(hidden_size)..
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
-        scale_attn_by_inverse_layer_idx (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to additionally scale attention weights by ``1 / layer_idx + 1``.
-        reorder_and_upcast_attn (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+            Whether to additionally scale attention weights by `1 / (layer_idx + 1)`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
             Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
             dot-product/softmax to float() when training with mixed precision.
 
-    Example::
+    Example:
 
-        >>> from transformers import GPT2Model, GPT2Config
+    ```python
+    >>> from transformers import GPT2Model, GPT2Config
 
-        >>> # Initializing a GPT2 configuration
-        >>> configuration = GPT2Config()
+    >>> # Initializing a GPT2 configuration
+    >>> configuration = GPT2Config()
 
-        >>> # Initializing a model from the configuration
-        >>> model = GPT2Model(configuration)
+    >>> # Initializing a model from the configuration
+    >>> model = GPT2Model(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "gpt2"
     keys_to_ignore_at_inference = ["past_key_values"]
diff --git a/src/transformers/models/gpt2/modeling_gpt2.py b/src/transformers/models/gpt2/modeling_gpt2.py
index bdc019fcba..46f8f9a29d 100644
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -642,17 +642,19 @@ PARALLELIZE_DOCSTRING = r"""
 DEPARALLELIZE_DOCSTRING = r"""
     Moves the model to cpu from a model parallel state.
 
-    Example::
+    Example:
 
-        # On a 4 GPU machine with gpt2-large:
-        model = GPT2LMHeadModel.from_pretrained('gpt2-large')
-        device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7],
+    ```python
+    # On a 4 GPU machine with gpt2-large:
+    model = GPT2LMHeadModel.from_pretrained('gpt2-large')
+    device_map = {0: [0, 1, 2, 3, 4, 5, 6, 7],
 
-                    1: [8, 9, 10, 11, 12, 13, 14, 15],
-                    2: [16, 17, 18, 19, 20, 21, 22, 23],
-                    3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]}
-        model.parallelize(device_map) # Splits the model across several devices
-        model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+                1: [8, 9, 10, 11, 12, 13, 14, 15],
+                2: [16, 17, 18, 19, 20, 21, 22, 23],
+                3: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]}
+    model.parallelize(device_map) # Splits the model across several devices
+    model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+    ```
 """
 
 
diff --git a/src/transformers/models/gpt2/tokenization_gpt2.py b/src/transformers/models/gpt2/tokenization_gpt2.py
index d09e4eedd0..e334c6c7c9 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2.py
@@ -108,42 +108,43 @@ class GPT2Tokenizer(PreTrainedTokenizer):
     This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
     be encoded differently whether it is at the beginning of the sentence (without space) or not:
 
-    ::
+    ```python
+    >>> from transformers import GPT2Tokenizer
+    >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+    >>> tokenizer("Hello world")['input_ids']
+    [15496, 995]
+    >>> tokenizer(" Hello world")['input_ids']
+    [18435, 995]
+    ```
 
-        >>> from transformers import GPT2Tokenizer
-        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-        >>> tokenizer("Hello world")['input_ids']
-        [15496, 995]
-        >>> tokenizer(" Hello world")['input_ids']
-        [18435, 995]
-
-    You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
     call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
 
-    .. note::
+    <Tip>
 
-        When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first
-        one).
+    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first
+    one).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Path to the merges file.
-        errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
-            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+        bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The beginning of sequence token.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The end of sequence token.
-        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
             other word. (GPT2 tokenizer detect beginning of words by the preceding space).
     """
diff --git a/src/transformers/models/gpt2/tokenization_gpt2_fast.py b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
index 1751247f35..54ea4fa27e 100644
--- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
@@ -69,51 +69,52 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class GPT2TokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's `tokenizers` library). Based on byte-level
+    Construct a "fast" GPT-2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
     Byte-Pair-Encoding.
 
     This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
     be encoded differently whether it is at the beginning of the sentence (without space) or not:
 
-    ::
+    ```python
+    >>> from transformers import GPT2TokenizerFast
+    >>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
+    >>> tokenizer("Hello world")['input_ids']
+    [15496, 995]
+    >>> tokenizer(" Hello world")['input_ids']
+    [18435, 995]
+    ```
 
-        >>> from transformers import GPT2TokenizerFast
-        >>> tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
-        >>> tokenizer("Hello world")['input_ids']
-        [15496, 995]
-        >>> tokenizer(" Hello world")['input_ids']
-        [18435, 995]
-
-    You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer, but since
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
     the model was not pretrained this way, it might yield a decrease in performance.
 
-    .. note::
+    <Tip>
 
-        When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
-        ``add_prefix_space=True``.
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with
+    `add_prefix_space=True`.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Path to the merges file.
-        errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
-            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+        bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The beginning of sequence token.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`<|endoftext|>`):
+        eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
             The end of sequence token.
-        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
             other word. (GPT2 tokenizer detect beginning of words by the preceding space).
-        trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        trim_offsets (`bool`, *optional*, defaults to `True`):
             Whether or not the post-processing step should trim offsets to avoid including whitespaces.
     """
 
diff --git a/src/transformers/models/gpt_neo/configuration_gpt_neo.py b/src/transformers/models/gpt_neo/configuration_gpt_neo.py
index 959d0bc7de..5499334c87 100644
--- a/src/transformers/models/gpt_neo/configuration_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/configuration_gpt_neo.py
@@ -33,66 +33,65 @@ GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class GPTNeoConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.GPTNeoModel`. It is used to
+    This is the configuration class to store the configuration of a [`GPTNeoModel`]. It is used to
     instantiate a GPT Neo model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the GPTNeo `gpt-neo-1.3B
-    <https://huggingface.co/EleutherAI/gpt-neo-1.3B>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the GPTNeo [gpt-neo-1.3B](https://huggingface.co/EleutherAI/gpt-neo-1.3B) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50257):
+        vocab_size (`int`, *optional*, defaults to 50257):
             Vocabulary size of the GPT Neo model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.GPTNeoModel`. Vocabulary size of the model.
-            Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of
-            :class:`~transformers.GPTNeoModel`.
-        attention_types (:obj:`List`, `optional`, defaults to :obj:`[[["global", "local"], 12]]`):
-            The type of attention for each layer in a :obj:`List` of the following format :obj:`[[["attention_type"],
-            num_layerss]]` e.g. for a 24 layer model :obj:`[[["global"], 24]]` or :obj:`[[["global", "local"], 12]]`
-            Choose the value of ``attention_type`` from :obj:`["global", "local"]`
-        hidden_size (:obj:`int`, `optional`, defaults to 2048):
+            `input_ids` passed when calling [`GPTNeoModel`]. Vocabulary size of the model.
+            Defines the different tokens that can be represented by the `input_ids` passed to the forward method of
+            [`GPTNeoModel`].
+        attention_types (`List`, *optional*, defaults to `[[["global", "local"], 12]]`):
+            The type of attention for each layer in a `List` of the following format `[[["attention_type"], num_layers]]`, e.g. for a 24 layer model `[[["global"], 24]]` or `[[["global", "local"], 12]]`.
+            Choose the value of `attention_type` from `["global", "local"]`.
+        hidden_size (`int`, *optional*, defaults to 2048):
             Dimensionality of the encoder layers and the pooler layer.
-        num_layers (:obj:`int`, `optional`, defaults to 24):
+        num_layers (`int`, *optional*, defaults to 24):
             Number of hidden layers in the Transformer encoder.
-        num_heads (:obj:`int`, `optional`, defaults to 16):
+        num_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 8192):
+        intermediate_size (`int`, *optional*, defaults to 8192):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_new"`):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu_new"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        embed_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.GPTNeoModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`GPTNeoModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
             The epsilon used by the layer normalization layers.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if ``config.is_decoder=True``.
+            relevant if `config.is_decoder=True`.
 
-        Example::
+    Example:
 
-            >>> from transformers import GPTNeoModel, GPTNeoConfig
+    ```python
+    >>> from transformers import GPTNeoModel, GPTNeoConfig
 
-            >>> # Initializing a GPTNeo EleutherAI/gpt-neo-1.3B style configuration
-            >>> configuration = GPTNeoConfig()
+    >>> # Initializing a GPTNeo EleutherAI/gpt-neo-1.3B style configuration
+    >>> configuration = GPTNeoConfig()
 
-            >>> # Initializing a model from the EleutherAI/gpt-neo-1.3B style configuration
-            >>> model = GPTNeoModel(configuration)
+    >>> # Initializing a model from the EleutherAI/gpt-neo-1.3B style configuration
+    >>> model = GPTNeoModel(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "gpt_neo"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {"num_attention_heads": "num_heads", "num_hidden_layers": "num_layers"}
diff --git a/src/transformers/models/gptj/configuration_gptj.py b/src/transformers/models/gptj/configuration_gptj.py
index 6c754ddc42..1079169ac3 100644
--- a/src/transformers/models/gptj/configuration_gptj.py
+++ b/src/transformers/models/gptj/configuration_gptj.py
@@ -28,60 +28,60 @@ GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class GPTJConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.GPTJModel`. It is used to
+    This is the configuration class to store the configuration of a [`GPTJModel`]. It is used to
     instantiate a GPT-J model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the GPT-J `gpt-j-6B
-    <https://huggingface.co/EleutherAI/gpt-j-6B>`__ architecture. Configuration objects inherit from
-    :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from
-    :class:`~transformers.PretrainedConfig` for more information.
+    configuration with the defaults will yield a similar configuration to that of the GPT-J [gpt-j-6B](https://huggingface.co/EleutherAI/gpt-j-6B) architecture. Configuration objects inherit from
+    [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from
+    [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50400):
+        vocab_size (`int`, *optional*, defaults to 50400):
             Vocabulary size of the GPT-J model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.GPTJModel`.
-        n_positions (:obj:`int`, `optional`, defaults to 2048):
+            `input_ids` passed when calling [`GPTJModel`].
+        n_positions (`int`, *optional*, defaults to 2048):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        n_embd (:obj:`int`, `optional`, defaults to 4096):
+        n_embd (`int`, *optional*, defaults to 4096):
             Dimensionality of the embeddings and hidden states.
-        n_layer (:obj:`int`, `optional`, defaults to 28):
+        n_layer (`int`, *optional*, defaults to 28):
             Number of hidden layers in the Transformer encoder.
-        n_head (:obj:`int`, `optional`, defaults to 16):
+        n_head (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        rotary_dim (:obj:`int`, `optional`, defaults to 64):
+        rotary_dim (`int`, *optional*, defaults to 64):
             Number of dimensions in the embedding that Rotary Position Embedding is applied to.
-        n_inner (:obj:`int`, `optional`, defaults to None):
-            Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
-        activation_function (:obj:`str`, `optional`, defaults to :obj:`"gelu_new"`):
-            Activation function, to be selected in the list :obj:`["relu", "silu", "gelu", "tanh", "gelu_new"]`.
-        resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+        n_inner (`int`, *optional*, defaults to `None`):
+            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times `n_embd`.
+        activation_function (`str`, *optional*, defaults to `"gelu_new"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the embeddings.
-        attn_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention.
-        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
             The epsilon to use in the layer normalization layers.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        scale_attn_weights (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
             Scale attention weights by dividing by sqrt(hidden_size).
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
 
-    Example::
+    Example:
 
-        >>> from transformers import GPTJModel, GPTJConfig
+    ```python
+    >>> from transformers import GPTJModel, GPTJConfig
 
-        >>> # Initializing a GPT-J 6B configuration
-        >>> configuration = GPTJConfig()
+    >>> # Initializing a GPT-J 6B configuration
+    >>> configuration = GPTJConfig()
 
-        >>> # Initializing a model from the configuration
-        >>> model = GPTJModel(configuration)
+    >>> # Initializing a model from the configuration
+    >>> model = GPTJModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "gptj"
     attribute_map = {
         "max_position_embeddings": "n_positions",
diff --git a/src/transformers/models/gptj/modeling_gptj.py b/src/transformers/models/gptj/modeling_gptj.py
index 0c6b60f65f..05e7f16700 100755
--- a/src/transformers/models/gptj/modeling_gptj.py
+++ b/src/transformers/models/gptj/modeling_gptj.py
@@ -424,15 +424,18 @@ PARALLELIZE_DOCSTRING = r"""
 DEPARALLELIZE_DOCSTRING = r"""
     Moves the model to CPU from a model parallel state.
 
-    Example::
-        # On a 4 GPU machine with gpt-j-6B:
-        model = GPTJForCausalLM.from_pretrained('EleutherAI/gpt-j-6B')
-        device_map = {0: [0, 1, 2, 3, 4, 5, 6],
-                      1: [7, 8, 9, 10, 11, 12, 13],
-                      2: [14, 15, 16, 17, 18, 19, 20],
-                      3: [21, 22, 23, 24, 25, 26, 27]}
-        model.parallelize(device_map) # Splits the model across several devices
-        model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+    Example:
+
+    ```python
+    # On a 4 GPU machine with gpt-j-6B:
+    model = GPTJForCausalLM.from_pretrained('EleutherAI/gpt-j-6B')
+    device_map = {0: [0, 1, 2, 3, 4, 5, 6],
+                  1: [7, 8, 9, 10, 11, 12, 13],
+                  2: [14, 15, 16, 17, 18, 19, 20],
+                  3: [21, 22, 23, 24, 25, 26, 27]}
+    model.parallelize(device_map) # Splits the model across several devices
+    model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+    ```
 """
 
 
diff --git a/src/transformers/models/herbert/tokenization_herbert.py b/src/transformers/models/herbert/tokenization_herbert.py
index 7f954f43b9..c025c02e9b 100644
--- a/src/transformers/models/herbert/tokenization_herbert.py
+++ b/src/transformers/models/herbert/tokenization_herbert.py
@@ -49,7 +49,7 @@ class HerbertTokenizer(XLMTokenizer):
 
     - Such pretokenized input is BPE subtokenized
 
-    This tokenizer inherits from :class:`~transformers.XLMTokenizer` which contains most of the methods. Users should
+    This tokenizer inherits from [`XLMTokenizer`] which contains most of the methods. Users should
     refer to the superclass for more information regarding methods.
     """
 
diff --git a/src/transformers/models/herbert/tokenization_herbert_fast.py b/src/transformers/models/herbert/tokenization_herbert_fast.py
index 2961d5c94c..7d08b18983 100644
--- a/src/transformers/models/herbert/tokenization_herbert_fast.py
+++ b/src/transformers/models/herbert/tokenization_herbert_fast.py
@@ -39,20 +39,20 @@ PRETRAINED_INIT_CONFIGURATION = {}
 
 class HerbertTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's `tokenizers` library).
+    Construct a "Fast" BPE tokenizer for HerBERT (backed by HuggingFace's *tokenizers* library).
 
     Peculiarities:
 
     - uses BERT's pre-tokenizer: BertPreTokenizer splits tokens on spaces, and also on punctuation. Each occurrence of
       a punctuation character will be treated separately.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the methods. Users
     should refer to the superclass for more information regarding methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Path to the merges file.
     """
 
@@ -94,17 +94,17 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An HerBERT, like BERT sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         cls = [self.cls_token_id]
@@ -119,18 +119,18 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
@@ -148,19 +148,19 @@ class HerbertTokenizerFast(PreTrainedTokenizerFast):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. HerBERT, like
         BERT sequence pair mask has the following format:
 
-        ::
-
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py
index 84ed7a70bc..b1528c03fb 100644
--- a/src/transformers/models/hubert/configuration_hubert.py
+++ b/src/transformers/models/hubert/configuration_hubert.py
@@ -28,129 +28,126 @@ HUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class HubertConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.HubertModel`. It is used to
+    This is the configuration class to store the configuration of a [`HubertModel`]. It is used to
     instantiate an Hubert model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the Hubert
-    `facebook/hubert-base-ls960 <https://huggingface.co/facebook/hubert-base-ls960>`__ architecture.
+    [facebook/hubert-base-ls960](https://huggingface.co/facebook/hubert-base-ls960) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 32):
+        vocab_size (`int`, *optional*, defaults to 32):
             Vocabulary size of the Hubert model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.HubertModel`. Vocabulary size of the model.
-            Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of
-            :class:`~transformers.HubertModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `inputs_ids` passed when calling [`HubertModel`]. Vocabulary size of the model.
+            Defines the different tokens that can be represented by the *inputs_ids* passed to the forward method of
+            [`HubertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout(:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout(`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout(:obj:`float`, `optional`, defaults to 0.1):
+        attention_dropout(`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        final_dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probabilitiy for the final projection layer of :class:`Wav2Vec2ForCTC`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`Wav2Vec2ForCTC`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        feat_extract_norm (:obj:`str`, `optional`, defaults to :obj:`"group"`):
-            The norm to be applied to 1D convolutional layers in feature extractor. One of :obj:`"group"` for group
-            normalization of only the first 1D convolutional layer or :obj:`"layer"` for layer normalization of all 1D
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
             convolutional layers.
-        feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probability for output of the feature extractor.
-        feat_proj_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        feat_proj_layer_norm (`bool`, *optional*, defaults to `True`):
             Whether to apply LayerNorm to the output of the feature extractor.
-        feat_extract_activation (:obj:`str, `optional`, defaults to :obj:`"gelu"`):
+        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the 1D convolutional layers of the feature
-            extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        conv_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 512, 512, 512)`):
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
-            feature extractor. The length of `conv_dim` defines the number of 1D convolutional layers.
-        conv_stride (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 2, 2, 2, 2, 2, 2)`):
+            feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
-            of `conv_stride` defines the number of convolutional layers and has to match the the length of `conv_dim`.
-        conv_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(10, 3, 3, 3, 3, 3, 3)`):
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
-            length of `conv_kernel` defines the number of convolutional layers and has to match the the length of
-            `conv_dim`.
-        conv_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
             Whether the 1D convolutional layers have a bias.
-        num_conv_pos_embeddings (:obj:`int`, `optional`, defaults to 128):
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
             Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
             embeddings layer.
-        num_conv_pos_embedding_groups (:obj:`int`, `optional`, defaults to 16):
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
             Number of groups of 1D convolutional positional embeddings layer.
-        do_stable_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether do apply `stable` layer norm architecture of the Transformer encoder. ``do_stable_layer_norm is
-            True`` corresponds to applying layer norm before the attention layer, whereas ``do_stable_layer_norm is
-            False`` corresponds to applying layer norm after the attention layer.
-        apply_spec_augment (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer.
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
             Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
-            `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
-            <https://arxiv.org/abs/1904.08779>`__.
-        mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
             procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
-            masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
-            the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
-        mask_time_length (:obj:`int`, `optional`, defaults to 10):
+            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease
+            the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
             Length of vector span along the time axis.
-        mask_time_min_masks (:obj:`int`, `optional`, defaults to 2),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the time axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_time_min_masks (`int`, *optional*, defaults to 2):
+            The minimum number of masks of length `mask_time_length` generated along the time axis, each time
+            step, irrespectively of `mask_time_prob`. Only relevant if
             ''mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks''
-        mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
             masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
             the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
-            span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
-            overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
-            is True``.
-        mask_feature_length (:obj:`int`, `optional`, defaults to 10):
+            span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that
+            overlap may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
             Length of vector span along the feature axis.
-        mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_feature_min_masks (`int`, *optional*, defaults to 0):
+            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+            step, irrespectively of `mask_feature_prob`. Only relevant if
             ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
-        ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
-            Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
-            instance of :class:`~transformers.HubertForCTC`.
-        ctc_zero_infinity (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to zero infinite losses and the associated gradients of ``torch.nn.CTCLoss``. Infinite losses
+        ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`HubertForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses
             mainly occur when the inputs are too short to be aligned to the targets. Only relevant when training an
-            instance of :class:`~transformers.HubertForCTC`.
-        use_weighted_layer_sum (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            instance of [`HubertForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
             Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
-            instance of :class:`~transformers.HubertForSequenceClassification`.
-        classifier_proj_size (:obj:`int`, `optional`, defaults to 256):
+            instance of [`HubertForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 256):
             Dimensionality of the projection before token mean-pooling for classification.
 
-    Example::
+    Example:
 
-        >>> from transformers import HubertModel, HubertConfig
+    ```python
+    >>> from transformers import HubertModel, HubertConfig
 
-        >>> # Initializing a Hubert facebook/hubert-base-ls960 style configuration
-        >>> configuration = HubertConfig()
+    >>> # Initializing a Hubert facebook/hubert-base-ls960 style configuration
+    >>> configuration = HubertConfig()
 
-        >>> # Initializing a model from the facebook/hubert-base-ls960 style configuration
-        >>> model = HubertModel(configuration)
+    >>> # Initializing a model from the facebook/hubert-base-ls960 style configuration
+    >>> model = HubertModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "hubert"
 
     def __init__(
diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py
index 416f6ce63d..d75ad2a056 100755
--- a/src/transformers/models/hubert/modeling_hubert.py
+++ b/src/transformers/models/hubert/modeling_hubert.py
@@ -977,26 +977,27 @@ class HubertModel(HubertPreTrainedModel):
 
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import Wav2Vec2Processor, HubertModel
-            >>> from datasets import load_dataset
-            >>> import soundfile as sf
+        ```python
+        >>> from transformers import Wav2Vec2Processor, HubertModel
+        >>> from datasets import load_dataset
+        >>> import soundfile as sf
 
-            >>> processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
-            >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
+        >>> processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")
+        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")
 
-            >>> def map_to_array(batch):
-            ...     speech, _ = sf.read(batch["file"])
-            ...     batch["speech"] = speech
-            ...     return batch
+        >>> def map_to_array(batch):
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...     return batch
 
-            >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-            >>> ds = ds.map(map_to_array)
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = ds.map(map_to_array)
 
-            >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
-            >>> hidden_states = model(input_values).last_hidden_state
-        """
+        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
+        >>> hidden_states = model(input_values).last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py
index d25ee4f38c..ab10009ad9 100644
--- a/src/transformers/models/hubert/modeling_tf_hubert.py
+++ b/src/transformers/models/hubert/modeling_tf_hubert.py
@@ -1405,26 +1405,27 @@ class TFHubertModel(TFHubertPreTrainedModel):
 
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import Wav2Vec2Processor, TFHubertModel
-            >>> from datasets import load_dataset
-            >>> import soundfile as sf
+        ```python
+        >>> from transformers import Wav2Vec2Processor, TFHubertModel
+        >>> from datasets import load_dataset
+        >>> import soundfile as sf
 
-            >>> processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-base-960h")
-            >>> model = TFHubertModel.from_pretrained("facebook/hubert-base-960h")
+        >>> processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-base-960h")
+        >>> model = TFHubertModel.from_pretrained("facebook/hubert-base-960h")
 
-            >>> def map_to_array(batch):
-            ...     speech, _ = sf.read(batch["file"])
-            ...     batch["speech"] = speech
-            ...     return batch
+        >>> def map_to_array(batch):
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...     return batch
 
-            >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-            >>> ds = ds.map(map_to_array)
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = ds.map(map_to_array)
 
-            >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
-            >>> hidden_states = model(input_values).last_hidden_state
-        """
+        >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
+        >>> hidden_states = model(input_values).last_hidden_state
+        ```"""
 
         inputs = input_values_processing(
             func=self.call,
diff --git a/src/transformers/models/ibert/configuration_ibert.py b/src/transformers/models/ibert/configuration_ibert.py
index 397b6fd1e6..b389a30bf0 100644
--- a/src/transformers/models/ibert/configuration_ibert.py
+++ b/src/transformers/models/ibert/configuration_ibert.py
@@ -31,55 +31,53 @@ IBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class IBertConfig(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a :class:`~transformers.IBertModel`. It is used to
+    This is the configuration class to store the configuration of an [`IBertModel`]. It is used to
     instantiate a I-BERT model according to the specified arguments,
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the I-BERT model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.IBertModel`
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `input_ids` passed when calling [`IBertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.IBertModel`
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`IBertModel`]
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
-            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
-            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
-            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
-            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
-            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
-            <https://arxiv.org/abs/2009.13658>`__.
-        quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
+            `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
+            `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
+            *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        quant_mode (`bool`, *optional*, defaults to `False`):
             Whether to quantize the model or not.
-        force_dequant (:obj:`str`, `optional`, defaults to :obj:`"none"`):
+        force_dequant (`str`, *optional*, defaults to `"none"`):
             Force dequantize specific nonlinear layer. Dequatized layers are then executed with full precision.
-            :obj:`"none"`, :obj:`"gelu"`, :obj:`"softmax"`, :obj:`"layernorm"` and :obj:`"nonlinear"` are supported. As
-            deafult, it is set as :obj:`"none"`, which does not dequantize any layers. Please specify :obj:`"gelu"`,
-            :obj:`"softmax"`, or :obj:`"layernorm"` to dequantize GELU, Softmax, or LayerNorm, respectively.
-            :obj:`"nonlinear"` will dequantize all nonlinear layers, i.e., GELU, Softmax, and LayerNorm.
+            `"none"`, `"gelu"`, `"softmax"`, `"layernorm"` and `"nonlinear"` are supported. As
+            default, it is set as `"none"`, which does not dequantize any layers. Please specify `"gelu"`,
+            `"softmax"`, or `"layernorm"` to dequantize GELU, Softmax, or LayerNorm, respectively.
+            `"nonlinear"` will dequantize all nonlinear layers, i.e., GELU, Softmax, and LayerNorm.
     """
 
     model_type = "ibert"
diff --git a/src/transformers/models/ibert/quant_modules.py b/src/transformers/models/ibert/quant_modules.py
index 386988c06d..83b173a873 100644
--- a/src/transformers/models/ibert/quant_modules.py
+++ b/src/transformers/models/ibert/quant_modules.py
@@ -30,15 +30,15 @@ logger = logging.get_logger(__name__)
 
 class QuantEmbedding(nn.Module):
     """
-    Quantized version of :obj:`torch.nn.Embedding`. Adds quantization-specific arguments on top of
-    :obj:`torch.nn.Embedding`.
+    Quantized version of `torch.nn.Embedding`. Adds quantization-specific arguments on top of
+    `torch.nn.Embedding`.
 
     Args:
-        weight_bit (:obj:`int`, `optional`, defaults to :obj:`8`):
+        weight_bit (`int`, *optional*, defaults to `8`):
             Bitwidth for the quantized weight.
-        momentum (:obj:`float`, `optional`, defaults to :obj:`0.95`):
+        momentum (`float`, *optional*, defaults to `0.95`):
             Momentum for updating the activation quantization range.
-        quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        quant_mode (`bool`, *optional*, defaults to `False`):
             Whether or not the layer is quantized.
     """
 
@@ -117,15 +117,15 @@ class QuantAct(nn.Module):
     Quantizes the given activation.
 
     Args:
-        activation_bit (:obj:`int`):
+        activation_bit (`int`):
             Bitwidth for the quantized activation.
-        act_range_momentum (:obj:`float`, `optional`, defaults to :obj:`0.95`):
+        act_range_momentum (`float`, *optional*, defaults to `0.95`):
             Momentum for updating the activation quantization range.
-        per_channel (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        per_channel (`bool`, *optional*, defaults to `False`):
             Whether to or not use channel-wise quantization.
-        channel_len (:obj:`int`, `optional`):
-            Specify the channel length when set the `per_channel` True.
-        quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        channel_len (`int`, *optional*):
+            Specify the channel length when *per_channel* is set to True.
+        quant_mode (`bool`, *optional*, defaults to `False`):
             Whether or not the layer is quantized.
     """
 
@@ -221,16 +221,16 @@ class QuantAct(nn.Module):
 
 class QuantLinear(nn.Module):
     """
-    Quantized version of :obj:`torch.nn.Linear`. Adds quantization-specific arguments on top of :obj:`torch.nn.Linear`.
+    Quantized version of `torch.nn.Linear`. Adds quantization-specific arguments on top of `torch.nn.Linear`.
 
     Args:
-        weight_bit (:obj:`int`, `optional`, defaults to :obj:`8`):
+        weight_bit (`int`, *optional*, defaults to `8`):
             Bitwidth for the quantized weight.
-        bias_bit (:obj:`int`, `optional`, defaults to :obj:`32`):
+        bias_bit (`int`, *optional*, defaults to `32`):
             Bitwidth for the quantized bias.
-        per_channel (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        per_channel (`bool`, *optional*, defaults to `False`):
             Whether or not to use channel-wise quantization.
-        quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        quant_mode (`bool`, *optional*, defaults to `False`):
             Whether or not the layer is quantized.
     """
 
@@ -301,12 +301,12 @@ class QuantLinear(nn.Module):
 
 class IntGELU(nn.Module):
     """
-    Quantized version of :obj:`torch.nn.GELU`. Adds quantization-specific arguments on top of :obj:`torch.nn.GELU`.
+    Quantized version of `torch.nn.GELU`. Adds quantization-specific arguments on top of `torch.nn.GELU`.
 
     Args:
-        quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        quant_mode (`bool`, *optional*, defaults to `False`):
             Whether or not the layer is quantized.
-        force_dequant (:obj:`str`, `optional`, defaults to :obj:`"none"`):
+        force_dequant (`str`, *optional*, defaults to `"none"`):
             Force dequantize the layer if either "gelu" or "nonlinear" is given.
     """
 
@@ -358,15 +358,15 @@ class IntGELU(nn.Module):
 
 class IntSoftmax(nn.Module):
     """
-    Quantized version of :obj:`torch.nn.Softmax`. Adds quantization-specific arguments on top of
-    :obj:`torch.nn.Softmax`.
+    Quantized version of `torch.nn.Softmax`. Adds quantization-specific arguments on top of
+    `torch.nn.Softmax`.
 
     Args:
-        output_bit (:obj:`int`):
+        output_bit (`int`):
             Bitwidth for the layer output activation.
-        quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        quant_mode (`bool`, *optional*, defaults to `False`):
             Whether or not the layer is quantized.
-        force_dequant (:obj:`str`, `optional`, defaults to :obj:`"none"`):
+        force_dequant (`str`, *optional*, defaults to `"none"`):
             Force dequantize the layer if either "softmax" or "nonlinear" is given.
     """
 
@@ -430,15 +430,15 @@ class IntSoftmax(nn.Module):
 
 class IntLayerNorm(nn.Module):
     """
-    Quantized version of :obj:`torch.nn.LayerNorm`. Adds quantization-specific arguments on top of
-    :obj:`torch.nn.LayerNorm`.
+    Quantized version of `torch.nn.LayerNorm`. Adds quantization-specific arguments on top of
+    `torch.nn.LayerNorm`.
 
     Args:
-        output_bit (:obj:`int`, `optional`, defaults to :obj:`8`):
+        output_bit (`int`, *optional*, defaults to `8`):
             Bitwidth for the layer output activation.
-        quant_mode (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        quant_mode (`bool`, *optional*, defaults to `False`):
             Whether or not the layer is quantized.
-        force_dequant (:obj:`str`, `optional`, defaults to :obj:`"none"`):
+        force_dequant (`str`, *optional*, defaults to `"none"`):
             Force dequantize the layer if either "layernorm" or "nonlinear" is given.
     """
 
@@ -535,17 +535,17 @@ def get_percentile_min_max(input, lower_percentile, upper_percentile, output_ten
     Calculate the percentile max and min values in a given tensor
 
     Args:
-        input (:obj:`torch.Tensor`):
+        input (`torch.Tensor`):
             The target tensor to calculate percentile max and min.
-        lower_percentile (:obj:`float`):
+        lower_percentile (`float`):
             If 0.1, means we return the value of the smallest 0.1% value in the tensor as percentile min.
-        upper_percentile (:obj:`float`):
+        upper_percentile (`float`):
             If 99.9, means we return the value of the largest 0.1% value in the tensor as percentile max.
-        output_tensor (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        output_tensor (`bool`, *optional*, defaults to `False`):
             If True, this function returns tensors, otherwise it returns values.
 
     Returns:
-        :obj:`Tuple(torch.Tensor, torch.Tensor)`: Percentile min and max value of `input`
+        `Tuple(torch.Tensor, torch.Tensor)`: Percentile min and max value of *input*
     """
     input_length = input.shape[0]
 
@@ -571,17 +571,17 @@ def linear_quantize(input, scale, zero_point, inplace=False):
     Quantize single-precision input tensor to integers with the given scaling factor and zeropoint.
 
     Args:
-        input (:obj:`torch.Tensor`):
+        input (`torch.Tensor`):
             Single-precision input tensor to be quantized.
-        scale (:obj:`torch.Tensor`):
+        scale (`torch.Tensor`):
             Scaling factor for quantization.
-        zero_pint (:obj:`torch.Tensor`):
+        zero_point (`torch.Tensor`):
             Shift for quantization.
-        inplace (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        inplace (`bool`, *optional*, defaults to `False`):
             Whether to compute inplace or not.
 
     Returns:
-        :obj:`torch.Tensor`: Linearly quantized value of `input` according to `scale` and `zero_point`.
+        `torch.Tensor`: Linearly quantized value of *input* according to *scale* and *zero_point*.
     """
     # reshape scale and zeropoint for convolutional weights and activation
     if len(input.shape) == 4:
@@ -606,16 +606,16 @@ def symmetric_linear_quantization_params(num_bits, saturation_min, saturation_ma
     Compute the scaling factor with the given quantization range for symmetric quantization.
 
     Args:
-        saturation_min (:obj:`torch.Tensor`):
+        saturation_min (`torch.Tensor`):
             Lower bound for quantization range.
-        saturation_max (:obj:`torch.Tensor`):
+        saturation_max (`torch.Tensor`):
             Upper bound for quantization range.
-        per_channel (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        per_channel (`bool`, *optional*, defaults to `False`):
             Whether to or not use channel-wise quantization.
 
     Returns:
-        :obj:`torch.Tensor`: Scaling factor that linearly quantizes the given range between `saturation_min` and
-        `saturation_max`.
+        `torch.Tensor`: Scaling factor that linearly quantizes the given range between *saturation_min* and
+        *saturation_max*.
     """
     # in this part, we do not need any gradient computation,
     # in order to enforce this, we put torch.no_grad()
@@ -642,18 +642,18 @@ class SymmetricQuantFunction(Function):
     def forward(ctx, x, k, percentile_mode, scale):
         """
         Args:
-            x (:obj:`torch.Tensor`):
+            x (`torch.Tensor`):
                 Floating point tensor to be quantized.
-            k (:obj:`int`):
+            k (`int`):
                 Quantization bitwidth.
-            percentile_mode (:obj:`bool`):
+            percentile_mode (`bool`):
                 Whether or not to use percentile calibration.
-            scale (:obj:`torch.Tensor`):
-                Pre-calculated scaling factor for `x`. Note that the current implementation of SymmetricQuantFunction
+            scale (`torch.Tensor`):
+                Pre-calculated scaling factor for *x*. Note that the current implementation of SymmetricQuantFunction
                 requires pre-calculated scaling factor.
 
         Returns:
-            :obj:`torch.Tensor`: Symmetric-quantized value of `input`.
+            `torch.Tensor`: Symmetric-quantized value of *x*.
         """
         zero_point = torch.tensor(0.0).to(scale.device)
 
@@ -712,7 +712,7 @@ def batch_frexp(inputs, max_bit=31):
     Decompose the scaling factor into mantissa and twos exponent.
 
     Args:
-        scaling_factor (:obj:`torch.Tensor`):
+        inputs (`torch.Tensor`):
             Target scaling factor to decompose.
 
     Returns:
@@ -746,22 +746,22 @@ class FixedPointMul(Function):
     Function to perform fixed-point arithmetic that can match integer arithmetic on hardware.
 
     Args:
-        pre_act (:obj:`torch.Tensor`):
+        pre_act (`torch.Tensor`):
             Input tensor.
-        pre_act_scaling_factor (:obj:`torch.Tensor`):
-            Scaling factor of the input tensor `pre_act`.
-        bit_num (:obj:`int`):
+        pre_act_scaling_factor (`torch.Tensor`):
+            Scaling factor of the input tensor *pre_act*.
+        bit_num (`int`):
             Quantization bitwidth.
-        z_scaling_factor (:obj:`torch.Tensor`):
+        z_scaling_factor (`torch.Tensor`):
             Scaling factor of the output tensor.
-        identity (:obj:`torch.Tensor`, `optional`):
+        identity (`torch.Tensor`, *optional*):
             Identity tensor, if exists.
-        identity_scaling_factor (:obj:`torch.Tensor`, `optional`):
-            Scaling factor of the identity tensor `identity`, if exists.
+        identity_scaling_factor (`torch.Tensor`, *optional*):
+            Scaling factor of the identity tensor *identity*, if exists.
 
     Returns:
-        :obj:`torch.Tensor`: Output tensor(`pre_act` if `identity` is not given, otherwise the addition of `pre_act`
-        and `identity`), whose scale is rescaled to `z_scaling_factor`.
+        `torch.Tensor`: Output tensor (*pre_act* if *identity* is not given, otherwise the addition of *pre_act*
+        and *identity*), whose scale is rescaled to *z_scaling_factor*.
     """
 
     @staticmethod
diff --git a/src/transformers/models/imagegpt/configuration_imagegpt.py b/src/transformers/models/imagegpt/configuration_imagegpt.py
index 5a8d0db144..5cfec7e4b3 100644
--- a/src/transformers/models/imagegpt/configuration_imagegpt.py
+++ b/src/transformers/models/imagegpt/configuration_imagegpt.py
@@ -29,67 +29,68 @@ IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class ImageGPTConfig(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a :class:`~transformers.ImageGPTModel` or a
-    :class:`~transformers.TFImageGPTModel`. It is used to instantiate a GPT-2 model according to the specified
+    This is the configuration class to store the configuration of an [`ImageGPTModel`] or a
+    [`TFImageGPTModel`]. It is used to instantiate a GPT-2 model according to the specified
     arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
-    configuration to that of the ImageGPT `small <https://huggingface.co/imagegpt>`__ architecture.
+    configuration to that of the ImageGPT [small](https://huggingface.co/imagegpt) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 512):
+        vocab_size (`int`, *optional*, defaults to 512):
             Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.ImageGPTModel` or
-            :class:`~transformers.TFImageGPTModel`.
-        n_positions (:obj:`int`, `optional`, defaults to 32*32):
+            `input_ids` passed when calling [`ImageGPTModel`] or
+            [`TFImageGPTModel`].
+        n_positions (`int`, *optional*, defaults to 32*32):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        n_embd (:obj:`int`, `optional`, defaults to 512):
+        n_embd (`int`, *optional*, defaults to 512):
             Dimensionality of the embeddings and hidden states.
-        n_layer (:obj:`int`, `optional`, defaults to 24):
+        n_layer (`int`, *optional*, defaults to 24):
             Number of hidden layers in the Transformer encoder.
-        n_head (:obj:`int`, `optional`, defaults to 8):
+        n_head (`int`, *optional*, defaults to 8):
             Number of attention heads for each attention layer in the Transformer encoder.
-        n_inner (:obj:`int`, `optional`, defaults to None):
-            Dimensionality of the inner feed-forward layers. :obj:`None` will set it to 4 times n_embd
-        activation_function (:obj:`str`, `optional`, defaults to :obj:`"quick_gelu"`):
+        n_inner (`int`, *optional*, defaults to None):
+            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"quick_gelu"`):
             Activation function (can be one of the activation functions defined in src/transformers/activations.py).
             Defaults to "quick_gelu".
-        resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the embeddings.
-        attn_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention.
-        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
             The epsilon to use in the layer normalization layers.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        scale_attn_weights (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
             Scale attention weights by dividing by sqrt(hidden_size)..
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
-        scale_attn_by_inverse_layer_idx (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to additionally scale attention weights by ``1 / layer_idx + 1``.
-        reorder_and_upcast_attn (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+            Whether to additionally scale attention weights by `1 / layer_idx + 1`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
             Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
             dot-product/softmax to float() when training with mixed precision.
 
-    Example::
+    Example:
 
-        >>> from transformers import ImageGPTModel, ImageGPTConfig
+    ```python
+    >>> from transformers import ImageGPTModel, ImageGPTConfig
 
-        >>> # Initializing a ImageGPT configuration
-        >>> configuration = ImageGPTConfig()
+    >>> # Initializing an ImageGPT configuration
+    >>> configuration = ImageGPTConfig()
 
-        >>> # Initializing a model from the configuration
-        >>> model = ImageGPTModel(configuration)
+    >>> # Initializing a model from the configuration
+    >>> model = ImageGPTModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "imagegpt"
     keys_to_ignore_at_inference = ["past_key_values"]
diff --git a/src/transformers/models/imagegpt/feature_extraction_imagegpt.py b/src/transformers/models/imagegpt/feature_extraction_imagegpt.py
index 85aec8a634..a6a069afff 100644
--- a/src/transformers/models/imagegpt/feature_extraction_imagegpt.py
+++ b/src/transformers/models/imagegpt/feature_extraction_imagegpt.py
@@ -49,23 +49,23 @@ class ImageGPTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMix
     resolution (such as 32x32 or 64x64), normalize them and finally color quantize them to obtain sequences of "pixel
     values" (color clusters).
 
-    This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main
+    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        clusters (:obj:`np.ndarray`):
-            The color clusters to use, as a :obj:`np.ndarray` of shape :obj:`(n_clusters, 3)`.
-        do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to resize the input to a certain :obj:`size`.
-        size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 32):
+        clusters (`np.ndarray`):
+            The color clusters to use, as a `np.ndarray` of shape `(n_clusters, 3)`.
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input to a certain `size`.
+        size (`int` or `Tuple(int)`, *optional*, defaults to 32):
             Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
-            integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize`
-            is set to :obj:`True`.
-        resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BILINEAR`):
-            An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
-            :obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
-            Only has an effect if :obj:`do_resize` is set to :obj:`True`.
-        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
+            is set to `True`.
+        resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
+            An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
+            `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
+            Only has an effect if `do_resize` is set to `True`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
             Whether or not to normalize the input to the range between -1 and +1.
     """
 
@@ -81,14 +81,14 @@ class ImageGPTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMix
 
     def normalize(self, image):
         """
-        Normalizes :obj:`image` into the range -1 to +1.
+        Normalizes `image` into the range -1 to +1.
 
         Args:
-            image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                 The image to normalize.
 
         Returns:
-            :obj:`np.ndarray`: The normalized image.
+            `np.ndarray`: The normalized image.
         """
         image = self.to_numpy_array(image, rescale=False, channel_first=False)
 
@@ -105,27 +105,29 @@ class ImageGPTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMix
         """
         Main method to prepare for the model one or several image(s).
 
-        .. warning::
+        <Tip warning={true}>
 
-           NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
-           PIL images.
+        NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so it is most efficient to pass
+        PIL images.
+
+        </Tip>
 
         Args:
-            images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                 tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                 number of channels, H and W are image height and width.
 
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
                 If set, will return tensors of a particular framework. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
-                * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
 
         Returns:
-            :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
 
             - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
               width).
diff --git a/src/transformers/models/layoutlm/configuration_layoutlm.py b/src/transformers/models/layoutlm/configuration_layoutlm.py
index 913a6bf792..57b3bb4637 100644
--- a/src/transformers/models/layoutlm/configuration_layoutlm.py
+++ b/src/transformers/models/layoutlm/configuration_layoutlm.py
@@ -34,61 +34,60 @@ LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class LayoutLMConfig(BertConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.LayoutLMModel`. It is used to
+    This is the configuration class to store the configuration of a [`LayoutLMModel`]. It is used to
     instantiate a LayoutLM model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the LayoutLM `layoutlm-base-uncased
-    <https://huggingface.co/microsoft/layoutlm-base-uncased>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the LayoutLM [layoutlm-base-uncased](https://huggingface.co/microsoft/layoutlm-base-uncased) architecture.
 
-    Configuration objects inherit from :class:`~transformers.BertConfig` and can be used to control the model outputs.
-    Read the documentation from :class:`~transformers.BertConfig` for more information.
+    Configuration objects inherit from [`BertConfig`] and can be used to control the model outputs.
+    Read the documentation from [`BertConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the LayoutLM model. Defines the different tokens that can be represented by the
-            `inputs_ids` passed to the forward method of :class:`~transformers.LayoutLMModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `input_ids` passed to the forward method of [`LayoutLMModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed into :class:`~transformers.LayoutLMModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into [`LayoutLMModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        max_2d_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+        max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum value that the 2D position embedding might ever used. Typically set this to something large
             just in case (e.g., 1024).
 
-    Examples::
+    Examples:
 
-        >>> from transformers import LayoutLMModel, LayoutLMConfig
+    ```python
+    >>> from transformers import LayoutLMModel, LayoutLMConfig
 
-        >>> # Initializing a LayoutLM configuration
-        >>> configuration = LayoutLMConfig()
+    >>> # Initializing a LayoutLM configuration
+    >>> configuration = LayoutLMConfig()
 
-        >>> # Initializing a model from the configuration
-        >>> model = LayoutLMModel(configuration)
+    >>> # Initializing a model from the configuration
+    >>> model = LayoutLMModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "layoutlm"
 
     def __init__(
diff --git a/src/transformers/models/layoutlm/modeling_layoutlm.py b/src/transformers/models/layoutlm/modeling_layoutlm.py
index 3aae3c62dd..ac0f8b7359 100644
--- a/src/transformers/models/layoutlm/modeling_layoutlm.py
+++ b/src/transformers/models/layoutlm/modeling_layoutlm.py
@@ -747,34 +747,35 @@ class LayoutLMModel(LayoutLMPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import LayoutLMTokenizer, LayoutLMModel
-            >>> import torch
+        ```python
+        >>> from transformers import LayoutLMTokenizer, LayoutLMModel
+        >>> import torch
 
-            >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
-            >>> model = LayoutLMModel.from_pretrained('microsoft/layoutlm-base-uncased')
+        >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
+        >>> model = LayoutLMModel.from_pretrained('microsoft/layoutlm-base-uncased')
 
-            >>> words = ["Hello", "world"]
-            >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+        >>> words = ["Hello", "world"]
+        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
 
-            >>> token_boxes = []
-            >>> for word, box in zip(words, normalized_word_boxes):
-            ...     word_tokens = tokenizer.tokenize(word)
-            ...     token_boxes.extend([box] * len(word_tokens))
-            >>> # add bounding boxes of cls + sep tokens
-            >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+        >>> token_boxes = []
+        >>> for word, box in zip(words, normalized_word_boxes):
+        ...     word_tokens = tokenizer.tokenize(word)
+        ...     token_boxes.extend([box] * len(word_tokens))
+        >>> # add bounding boxes of cls + sep tokens
+        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
 
-            >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
-            >>> input_ids = encoding["input_ids"]
-            >>> attention_mask = encoding["attention_mask"]
-            >>> token_type_ids = encoding["token_type_ids"]
-            >>> bbox = torch.tensor([token_boxes])
+        >>> encoding = tokenizer(' '.join(words), return_tensors="pt")
+        >>> input_ids = encoding["input_ids"]
+        >>> attention_mask = encoding["attention_mask"]
+        >>> token_type_ids = encoding["token_type_ids"]
+        >>> bbox = torch.tensor([token_boxes])
 
-            >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
+        >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
 
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
index 88326c109c..aa33734d4d 100644
--- a/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
+++ b/src/transformers/models/layoutlm/modeling_tf_layoutlm.py
@@ -947,34 +947,35 @@ class TFLayoutLMModel(TFLayoutLMPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import LayoutLMTokenizer, TFLayoutLMModel
-            >>> import tensorflow as tf
+        ```python
+        >>> from transformers import LayoutLMTokenizer, TFLayoutLMModel
+        >>> import tensorflow as tf
 
-            >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
-            >>> model = TFLayoutLMModel.from_pretrained('microsoft/layoutlm-base-uncased')
+        >>> tokenizer = LayoutLMTokenizer.from_pretrained('microsoft/layoutlm-base-uncased')
+        >>> model = TFLayoutLMModel.from_pretrained('microsoft/layoutlm-base-uncased')
 
-            >>> words = ["Hello", "world"]
-            >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
+        >>> words = ["Hello", "world"]
+        >>> normalized_word_boxes = [637, 773, 693, 782], [698, 773, 733, 782]
 
-            >>> token_boxes = []
-            >>> for word, box in zip(words, normalized_word_boxes):
-            ...     word_tokens = tokenizer.tokenize(word)
-            ...     token_boxes.extend([box] * len(word_tokens))
-            >>> # add bounding boxes of cls + sep tokens
-            >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
+        >>> token_boxes = []
+        >>> for word, box in zip(words, normalized_word_boxes):
+        ...     word_tokens = tokenizer.tokenize(word)
+        ...     token_boxes.extend([box] * len(word_tokens))
+        >>> # add bounding boxes of cls + sep tokens
+        >>> token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]
 
-            >>> encoding = tokenizer(' '.join(words), return_tensors="tf")
-            >>> input_ids = encoding["input_ids"]
-            >>> attention_mask = encoding["attention_mask"]
-            >>> token_type_ids = encoding["token_type_ids"]
-            >>> bbox = tf.convert_to_tensor([token_boxes])
+        >>> encoding = tokenizer(' '.join(words), return_tensors="tf")
+        >>> input_ids = encoding["input_ids"]
+        >>> attention_mask = encoding["attention_mask"]
+        >>> token_type_ids = encoding["token_type_ids"]
+        >>> bbox = tf.convert_to_tensor([token_boxes])
 
-            >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
+        >>> outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, token_type_ids=token_type_ids)
 
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         inputs = input_processing(
             func=self.call,
             config=self.config,
diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm.py b/src/transformers/models/layoutlm/tokenization_layoutlm.py
index 6a961c7747..603d730e8a 100644
--- a/src/transformers/models/layoutlm/tokenization_layoutlm.py
+++ b/src/transformers/models/layoutlm/tokenization_layoutlm.py
@@ -47,10 +47,10 @@ class LayoutLMTokenizer(BertTokenizer):
     r"""
     Constructs a LayoutLM tokenizer.
 
-    :class:`~transformers.LayoutLMTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    [`LayoutLMTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
     tokenization: punctuation splitting + wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
index 533645693e..8e8e13a90f 100644
--- a/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
+++ b/src/transformers/models/layoutlm/tokenization_layoutlm_fast.py
@@ -52,10 +52,10 @@ class LayoutLMTokenizerFast(BertTokenizerFast):
     r"""
     Constructs a "Fast" LayoutLMTokenizer.
 
-    :class:`~transformers.LayoutLMTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+    [`LayoutLMTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
     end-to-end tokenization: punctuation splitting + wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py
index f9ad445bf0..de19988365 100644
--- a/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/configuration_layoutlmv2.py
@@ -34,87 +34,87 @@ if is_detectron2_available():
 
 class LayoutLMv2Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.LayoutLMv2Model`. It is used
+    This is the configuration class to store the configuration of a [`LayoutLMv2Model`]. It is used
     to instantiate an LayoutLMv2 model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the LayoutLMv2
-    `microsoft/layoutlmv2-base-uncased <https://huggingface.co/microsoft/layoutlmv2-base-uncased>`__ architecture.
+    [microsoft/layoutlmv2-base-uncased](https://huggingface.co/microsoft/layoutlmv2-base-uncased) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the LayoutLMv2 model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.LayoutLMv2Model` or
-            :class:`~transformers.TFLayoutLMv2Model`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            the `input_ids` passed when calling [`LayoutLMv2Model`] or
+            [`TFLayoutLMv2Model`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimension of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.LayoutLMv2Model`
-            or :class:`~transformers.TFLayoutLMv2Model`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`LayoutLMv2Model`]
+            or [`TFLayoutLMv2Model`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        max_2d_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+        max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum value that the 2D position embedding might ever be used with. Typically set this to something
             large just in case (e.g., 1024).
-        max_rel_pos (:obj:`int`, `optional`, defaults to 128):
+        max_rel_pos (`int`, *optional*, defaults to 128):
             The maximum number of relative positions to be used in the self-attention mechanism.
-        rel_pos_bins (:obj:`int`, `optional`, defaults to 32):
+        rel_pos_bins (`int`, *optional*, defaults to 32):
             The number of relative position bins to be used in the self-attention mechanism.
-        fast_qkv (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        fast_qkv (`bool`, *optional*, defaults to `True`):
             Whether or not to use a single matrix for the queries, keys, values in the self-attention layers.
-        max_rel_2d_pos (:obj:`int`, `optional`, defaults to 256):
+        max_rel_2d_pos (`int`, *optional*, defaults to 256):
             The maximum number of relative 2D positions in the self-attention mechanism.
-        rel_2d_pos_bins (:obj:`int`, `optional`, defaults to 64):
+        rel_2d_pos_bins (`int`, *optional*, defaults to 64):
             The number of 2D relative position bins in the self-attention mechanism.
-        image_feature_pool_shape (:obj:`List[int]`, `optional`, defaults to [7, 7, 256]):
+        image_feature_pool_shape (`List[int]`, *optional*, defaults to [7, 7, 256]):
             The shape of the average-pooled feature map.
-        coordinate_size (:obj:`int`, `optional`, defaults to 128):
+        coordinate_size (`int`, *optional*, defaults to 128):
             Dimension of the coordinate embeddings.
-        shape_size (:obj:`int`, `optional`, defaults to 128):
+        shape_size (`int`, *optional*, defaults to 128):
             Dimension of the width and height embeddings.
-        has_relative_attention_bias (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        has_relative_attention_bias (`bool`, *optional*, defaults to `True`):
             Whether or not to use a relative attention bias in the self-attention mechanism.
-        has_spatial_attention_bias (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        has_spatial_attention_bias (`bool`, *optional*, defaults to `True`):
             Whether or not to use a spatial attention bias in the self-attention mechanism.
-        has_visual_segment_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        has_visual_segment_embedding (`bool`, *optional*, defaults to `False`):
             Whether or not to add visual segment embeddings.
-        detectron2_config_args (:obj:`dict`, `optional`):
-            Dictionary containing the configuration arguments of the Detectron2 visual backbone. Refer to `this file
-            <https://github.com/microsoft/unilm/blob/master/layoutlmft/layoutlmft/models/layoutlmv2/detectron2_config.py>`__
+        detectron2_config_args (`dict`, *optional*):
+            Dictionary containing the configuration arguments of the Detectron2 visual backbone. Refer to [this file](https://github.com/microsoft/unilm/blob/master/layoutlmft/layoutlmft/models/layoutlmv2/detectron2_config.py)
             for details regarding default values.
 
-    Example::
+    Example:
 
-        >>> from transformers import LayoutLMv2Model, LayoutLMv2Config
+    ```python
+    >>> from transformers import LayoutLMv2Model, LayoutLMv2Config
 
-        >>> # Initializing a LayoutLMv2 microsoft/layoutlmv2-base-uncased style configuration
-        >>> configuration = LayoutLMv2Config()
+    >>> # Initializing a LayoutLMv2 microsoft/layoutlmv2-base-uncased style configuration
+    >>> configuration = LayoutLMv2Config()
 
-        >>> # Initializing a model from the microsoft/layoutlmv2-base-uncased style configuration
-        >>> model = LayoutLMv2Model(configuration)
+    >>> # Initializing a model from the microsoft/layoutlmv2-base-uncased style configuration
+    >>> model = LayoutLMv2Model(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "layoutlmv2"
 
     def __init__(
diff --git a/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py b/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
index 7a8c4fab7b..b10cedf4ed 100644
--- a/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/feature_extraction_layoutlmv2.py
@@ -85,31 +85,32 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
     Constructs a LayoutLMv2 feature extractor. This can be used to resize document images to the same size, as well as
     to apply OCR on them in order to get a list of words and normalized bounding boxes.
 
-    This feature extractor inherits from :class:`~transformers.feature_extraction_utils.PreTrainedFeatureExtractor`
+    This feature extractor inherits from [`~feature_extraction_utils.PreTrainedFeatureExtractor`]
     which contains most of the main methods. Users should refer to this superclass for more information regarding those
     methods.
 
     Args:
-        do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to resize the input to a certain :obj:`size`.
-        size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 224):
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input to a certain `size`.
+        size (`int` or `Tuple(int)`, *optional*, defaults to 224):
             Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
-            integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize`
-            is set to :obj:`True`.
-        resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BILINEAR`):
-            An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
-            :obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
-            Only has an effect if :obj:`do_resize` is set to :obj:`True`.
-        apply_ocr (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
+            is set to `True`.
+        resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
+            An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
+            `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
+            Only has an effect if `do_resize` is set to `True`.
+        apply_ocr (`bool`, *optional*, defaults to `True`):
             Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
-        ocr_lang (:obj:`Optional[str]`, `optional`):
+        ocr_lang (`Optional[str]`, *optional*):
             The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
             used.
 
-            .. note::
+            <Tip>
 
-                LayoutLMv2FeatureExtractor uses Google's Tesseract OCR engine under the hood.
-    """
+            LayoutLMv2FeatureExtractor uses Google's Tesseract OCR engine under the hood.
+
+            </Tip>"""
 
     model_input_names = ["pixel_values"]
 
@@ -130,48 +131,49 @@ class LayoutLMv2FeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionM
         Main method to prepare for the model one or several image(s).
 
         Args:
-            images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                 tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                 number of channels, H and W are image height and width.
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
                 If set, will return tensors of a particular framework. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
-                * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
 
         Returns:
-            :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
 
             - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
               width).
             - **words** -- Optional words as identified by Tesseract OCR (only when
-              :class:`~transformers.LayoutLMv2FeatureExtractor` was initialized with :obj:`apply_ocr` set to ``True``).
+              [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to `True`).
             - **boxes** -- Optional bounding boxes as identified by Tesseract OCR, normalized based on the image size
-              (only when :class:`~transformers.LayoutLMv2FeatureExtractor` was initialized with :obj:`apply_ocr` set to
-              ``True``).
+              (only when [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr` set to
+              `True`).
 
-        Examples::
+        Examples:
 
-            >>> from transformers import LayoutLMv2FeatureExtractor
-            >>> from PIL import Image
+        ```python
+        >>> from transformers import LayoutLMv2FeatureExtractor
+        >>> from PIL import Image
 
-            >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+        >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
 
-            >>> # option 1: with apply_ocr=True (default)
-            >>> feature_extractor = LayoutLMv2FeatureExtractor()
-            >>> encoding = feature_extractor(image, return_tensors="pt")
-            >>> print(encoding.keys())
-            >>> # dict_keys(['pixel_values', 'words', 'boxes'])
+        >>> # option 1: with apply_ocr=True (default)
+        >>> feature_extractor = LayoutLMv2FeatureExtractor()
+        >>> encoding = feature_extractor(image, return_tensors="pt")
+        >>> print(encoding.keys())
+        >>> # dict_keys(['pixel_values', 'words', 'boxes'])
 
-            >>> # option 2: with apply_ocr=False
-            >>> feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
-            >>> encoding = feature_extractor(image, return_tensors="pt")
-            >>> print(encoding.keys())
-            >>> # dict_keys(['pixel_values'])
-        """
+        >>> # option 2: with apply_ocr=False
+        >>> feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)
+        >>> encoding = feature_extractor(image, return_tensors="pt")
+        >>> print(encoding.keys())
+        >>> # dict_keys(['pixel_values'])
+        ```"""
 
         # Input type checking for clearer error
         valid_images = False
diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
index e05bff9ebe..3df07a66f5 100755
--- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
@@ -820,21 +820,22 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import LayoutLMv2Processor, LayoutLMv2Model
-            >>> from PIL import Image
+        ```python
+        >>> from transformers import LayoutLMv2Processor, LayoutLMv2Model
+        >>> from PIL import Image
 
-            >>> processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased')
-            >>> model = LayoutLMv2Model.from_pretrained('microsoft/layoutlmv2-base-uncased')
+        >>> processor = LayoutLMv2Processor.from_pretrained('microsoft/layoutlmv2-base-uncased')
+        >>> model = LayoutLMv2Model.from_pretrained('microsoft/layoutlmv2-base-uncased')
 
-            >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+        >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
 
-            >>> encoding = processor(image, return_tensors="pt")
+        >>> encoding = processor(image, return_tensors="pt")
 
-            >>> outputs = model(**encoding)
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> outputs = model(**encoding)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py
index ed91556bc3..d49dbc99bb 100644
--- a/src/transformers/models/layoutlmv2/processing_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/processing_layoutlmv2.py
@@ -29,22 +29,22 @@ class LayoutLMv2Processor:
     Constructs a LayoutLMv2 processor which combines a LayoutLMv2 feature extractor and a LayoutLMv2 tokenizer into a
     single processor.
 
-    :class:`~transformers.LayoutLMv2Processor` offers all the functionalities you need to prepare data for the model.
+    [`LayoutLMv2Processor`] offers all the functionalities you need to prepare data for the model.
 
-    It first uses :class:`~transformers.LayoutLMv2FeatureExtractor` to resize document images to a fixed size, and
+    It first uses [`LayoutLMv2FeatureExtractor`] to resize document images to a fixed size, and
     optionally applies OCR to get words and normalized bounding boxes. These are then provided to
-    :class:`~transformers.LayoutLMv2Tokenizer` or :class:`~transformers.LayoutLMv2TokenizerFast`, which turns the words
-    and bounding boxes into token-level :obj:`input_ids`, :obj:`attention_mask`, :obj:`token_type_ids`, :obj:`bbox`.
-    Optionally, one can provide integer :obj:`word_labels`, which are turned into token-level :obj:`labels` for token
+    [`LayoutLMv2Tokenizer`] or [`LayoutLMv2TokenizerFast`], which turns the words
+    and bounding boxes into token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`.
+    Optionally, one can provide integer `word_labels`, which are turned into token-level `labels` for token
     classification tasks (such as FUNSD, CORD).
 
     Args:
-        feature_extractor (:obj:`LayoutLMv2FeatureExtractor`):
-            An instance of :class:`~transformers.LayoutLMv2FeatureExtractor`. The feature extractor is a required
+        feature_extractor (`LayoutLMv2FeatureExtractor`):
+            An instance of [`LayoutLMv2FeatureExtractor`]. The feature extractor is a required
             input.
-        tokenizer (:obj:`LayoutLMv2Tokenizer` or :obj:`LayoutLMv2TokenizerFast`):
-            An instance of :class:`~transformers.LayoutLMv2Tokenizer` or
-            :class:`~transformers.LayoutLMv2TokenizerFast`. The tokenizer is a required input.
+        tokenizer (`LayoutLMv2Tokenizer` or `LayoutLMv2TokenizerFast`):
+            An instance of [`LayoutLMv2Tokenizer`] or
+            [`LayoutLMv2TokenizerFast`]. The tokenizer is a required input.
     """
 
     def __init__(self, feature_extractor, tokenizer):
@@ -62,18 +62,20 @@ class LayoutLMv2Processor:
 
     def save_pretrained(self, save_directory):
         """
-        Save a LayoutLMv2 feature_extractor object and LayoutLMv2 tokenizer object to the directory ``save_directory``,
-        so that it can be re-loaded using the :func:`~transformers.LayoutLMv2Processor.from_pretrained` class method.
+        Save a LayoutLMv2 feature_extractor object and LayoutLMv2 tokenizer object to the directory `save_directory`,
+        so that it can be re-loaded using the [`~LayoutLMv2Processor.from_pretrained`] class method.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling
-            :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` and
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
-            docstrings of the methods above for more information.
+        This class method is simply calling
+        [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
+        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
+        docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            save_directory (:obj:`str` or :obj:`os.PathLike`):
+            save_directory (`str` or `os.PathLike`):
                 Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                 be created if it does not exist).
         """
@@ -84,35 +86,37 @@ class LayoutLMv2Processor:
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
         r"""
-        Instantiate a :class:`~transformers.LayoutLMv2Processor` from a pretrained LayoutLMv2 processor.
+        Instantiate a [`LayoutLMv2Processor`] from a pretrained LayoutLMv2 processor.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling LayoutLMv2FeatureExtractor's
-            :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained` and
-            LayoutLMv2TokenizerFast's
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
-            docstrings of the methods above for more information.
+        This class method is simply calling LayoutLMv2FeatureExtractor's
+        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
+        LayoutLMv2TokenizerFast's
+        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
+        docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 This can be either:
 
-                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a feature extractor file saved using the
-                  :meth:`~transformers.SequenceFeatureExtractor.save_pretrained` method, e.g.,
-                  ``./my_model_directory/``.
-                - a path or url to a saved feature extractor JSON `file`, e.g.,
-                  ``./my_model_directory/preprocessor_config.json``.
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g.,
+                  `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
 
-            use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            use_fast (`bool`, *optional*, defaults to `True`):
                 Whether or not to instantiate a fast tokenizer.
 
             **kwargs
-                Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
-                :class:`~transformers.PreTrainedTokenizer`
+                Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
+                [`PreTrainedTokenizer`]
         """
         feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
         if use_fast:
@@ -146,14 +150,12 @@ class LayoutLMv2Processor:
         **kwargs
     ) -> BatchEncoding:
         """
-        This method first forwards the :obj:`images` argument to
-        :meth:`~transformers.LayoutLMv2FeatureExtractor.__call__`. In case :class:`~LayoutLMv2FeatureExtractor` was
-        initialized with :obj:`apply_ocr` set to ``True``, it passes the obtained words and bounding boxes along with
-        the additional arguments to :meth:`~transformers.LayoutLMv2Tokenizer.__call__` and returns the output, together
-        with resized :obj:`images`. In case :class:`~LayoutLMv2FeatureExtractor` was initialized with :obj:`apply_ocr`
-        set to ``False``, it passes the words (:obj:`text`/:obj:`text_pair`) and :obj:`boxes` specified by the user
-        along with the additional arguments to :meth:`~transformers.LayoutLMv2Tokenizer.__call__` and returns the
-        output, together with resized :obj:`images`.
+        This method first forwards the `images` argument to
+        [`~LayoutLMv2FeatureExtractor.__call__`]. In case [`LayoutLMv2FeatureExtractor`] was
+        initialized with `apply_ocr` set to `True`, it passes the obtained words and bounding boxes along with
+        the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, together
+        with resized `images`. In case [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr`
+        set to `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along with the additional arguments to [`~LayoutLMv2Tokenizer.__call__`] and returns the output, together with resized `images`.
 
         Please refer to the docstring of the above two methods for more information.
         """
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
index 2c1f6eb712..87057a325d 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
@@ -59,51 +59,51 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 
 LAYOUTLMV2_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
-            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
                 Whether or not to encode the sequences with the special tokens relative to their model.
-            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
                 Activates and controls padding. Accepts the following values:
 
-                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                   single sequence if provided).
-                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                   maximum acceptable input length for the model if that argument is not provided.
-                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                   different lengths).
-            truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
                 Activates and controls truncation. Accepts the following values:
 
-                * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
-                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
+                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument
+                  `max_length` or to the maximum acceptable input length for the model if that argument is not
                   provided. This will truncate token by token, removing a token from the longest sequence in the pair
                   if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to
                   the maximum acceptable input length for the model if that argument is not provided. This will only
                   truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or
                   to the maximum acceptable input length for the model if that argument is not provided. This will only
                   truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with
                   sequence lengths greater than the model maximum admissible input size).
-            max_length (:obj:`int`, `optional`):
+            max_length (`int`, *optional*):
                 Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to
-                :obj:`None`, this will use the predefined model maximum length if a maximum length is required by one
+                `None`, this will use the predefined model maximum length if a maximum length is required by one
                 of the truncation/padding parameters. If the model has no specific maximum input length (like XLNet)
                 truncation/padding to a maximum length will be deactivated.
-            stride (:obj:`int`, `optional`, defaults to 0):
-                If set to a number along with :obj:`max_length`, the overflowing tokens returned when
-                :obj:`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+            stride (`int`, *optional*, defaults to 0):
+                If set to a number along with `max_length`, the overflowing tokens returned when
+                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
                 returned to provide some overlap between truncated and overflowing sequences. The value of this
                 argument defines the number of overlapping tokens.
-            pad_to_multiple_of (:obj:`int`, `optional`):
+            pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
 """
 
 
@@ -145,14 +145,14 @@ def subfinder(mylist, pattern):
 
 class LayoutLMv2Tokenizer(PreTrainedTokenizer):
     r"""
-    Construct a LayoutLMv2 tokenizer. Based on WordPiece. :class:`~transformers.LayoutLMv2Tokenizer` can be used to
-    turn words, word-level bounding boxes and optional word labels to token-level :obj:`input_ids`,
-    :obj:`attention_mask`, :obj:`token_type_ids`, :obj:`bbox`, and optional :obj:`labels` (for token classification).
+    Construct a LayoutLMv2 tokenizer. Based on WordPiece. [`LayoutLMv2Tokenizer`] can be used to
+    turn words, word-level bounding boxes and optional word labels to token-level `input_ids`,
+    `attention_mask`, `token_type_ids`, `bbox`, and optional `labels` (for token classification).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
-    :class:`~transformers.LayoutLMv2Tokenizer` runs end-to-end tokenization: punctuation splitting and wordpiece. It
+    [`LayoutLMv2Tokenizer`] runs end-to-end tokenization: punctuation splitting and wordpiece. It
     also turns the word-level bounding boxes into token-level bounding boxes.
 
     """
@@ -274,17 +274,17 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A BERT sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -297,18 +297,18 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -326,16 +326,16 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
         """
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
         pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second
-        sequence | If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        sequence | If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
@@ -392,16 +392,16 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
         sequences with word-level normalized bounding boxes and optional labels.
 
         Args:
-            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text (`str`, `List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
                 (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
                 words).
-            text_pair (:obj:`List[str]`, :obj:`List[List[str]]`):
+            text_pair (`List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
                 (pretokenized string).
-            boxes (:obj:`List[List[int]]`, :obj:`List[List[List[int]]]`):
+            boxes (`List[List[int]]`, `List[List[List[int]]]`):
                 Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
-            word_labels (:obj:`List[int]`, :obj:`List[List[int]]`, `optional`):
+            word_labels (`List[int]`, `List[List[int]]`, *optional*):
                 Word-level integer labels (for token classification tasks such as FUNSD, CORD).
         """
         # Input type checking for clearer error
@@ -772,12 +772,12 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
     ) -> BatchEncoding:
         """
         Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
-        ``__call__`` should be used instead.
+        `__call__` should be used instead.
 
         Args:
-            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text (`str`, `List[str]`, `List[List[str]]`):
                 The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
-            text_pair (:obj:`List[str]` or :obj:`List[int]`, `optional`):
+            text_pair (`List[str]` or `List[int]`, *optional*):
                 Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
                 list of list of strings (words of a batch of examples).
         """
@@ -893,18 +893,18 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
         """
         Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens,
         truncates sequences if overflowing while taking into account the special tokens and manages a moving window
-        (with user defined stride) for overflowing tokens. Please Note, for `text_pair` different than `None` and
-        `truncation_strategy = longest_first` or `True`, it is not possible to return overflowing tokens. Such a
+        (with user defined stride) for overflowing tokens. Please note, for `text_pair` different than `None` and
+        `truncation_strategy = longest_first` or `True`, it is not possible to return overflowing tokens. Such a
         combination of arguments will raise an error.
 
-        Word-level :obj:`boxes` are turned into token-level :obj:`bbox`. If provided, word-level :obj:`word_labels` are
-        turned into token-level :obj:`labels`. The word label is used for the first token of the word, while remaining
+        Word-level `boxes` are turned into token-level `bbox`. If provided, word-level `word_labels` are
+        turned into token-level `labels`. The word label is used for the first token of the word, while remaining
         tokens are labeled with -100, such that they will be ignored by the loss function.
 
         Args:
-            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text (`str`, `List[str]`, `List[List[str]]`):
                 The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
-            text_pair (:obj:`List[str]` or :obj:`List[int]`, `optional`):
+            text_pair (`List[str]` or `List[int]`, *optional*):
                 Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
                 list of list of strings (words of a batch of examples).
         """
@@ -1092,42 +1092,42 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
         Truncates a sequence pair in-place following the strategy.
 
         Args:
-            ids (:obj:`List[int]`):
-                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
-                and ``convert_tokens_to_ids`` methods.
-            token_boxes (:obj:`List[List[int]]`):
+            ids (`List[int]`):
+                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize`
+                and `convert_tokens_to_ids` methods.
+            token_boxes (`List[List[int]]`):
                 Bounding boxes of the first sequence.
-            pair_ids (:obj:`List[int]`, `optional`):
-                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
-                and ``convert_tokens_to_ids`` methods.
-            pair_token_boxes (:obj:`List[List[int]]`, `optional`):
+            pair_ids (`List[int]`, *optional*):
+                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
+                and `convert_tokens_to_ids` methods.
+            pair_token_boxes (`List[List[int]]`, *optional*):
                 Bounding boxes of the second sequence.
-            labels (:obj:`List[int]`, `optional`):
+            labels (`List[int]`, *optional*):
                 Labels of the first sequence (for token classification tasks).
-            num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
+            num_tokens_to_remove (`int`, *optional*, defaults to 0):
                 Number of tokens to remove using the truncation strategy.
-            truncation_strategy (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
                 The strategy to follow for truncation. Can be:
 
-                * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
                   to the maximum acceptable input length for the model if that argument is not provided. This will
                   truncate token by token, removing a token from the longest sequence in the pair if a pair of
                   sequences (or a batch of pairs) is provided.
-                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to
                   the maximum acceptable input length for the model if that argument is not provided. This will only
                   truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or
                   to the maximum acceptable input length for the model if that argument is not provided. This will only
                   truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
                   greater than the model maximum admissible input size).
-            stride (:obj:`int`, `optional`, defaults to 0):
+            stride (`int`, *optional*, defaults to 0):
                 If set to a positive number, the overflowing tokens returned will contain some tokens from the main
                 sequence returned. The value of this argument defines the number of additional tokens.
 
         Returns:
-            :obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the
-            list of overflowing tokens. Note: The `longest_first` strategy returns empty list of overflowing tokens if
+            `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the
+            list of overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if
             a pair of sequences (or a batch of pairs) is provided.
         """
         if num_tokens_to_remove <= 0:
@@ -1291,19 +1291,18 @@ class BasicTokenizer(object):
     Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
 
     Args:
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        never_split (:obj:`Iterable`, `optional`):
+        never_split (`Iterable`, *optional*):
             Collection of tokens which will never be split during tokenization. Only has an effect when
-            :obj:`do_basic_tokenize=True`
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
             Whether or not to tokenize Chinese characters.
 
-            This should likely be deactivated for Japanese (see this `issue
-            <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+            This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
+            value for `lowercase` (as in the original BERT).
     """
 
     def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
@@ -1320,9 +1319,9 @@ class BasicTokenizer(object):
         WordPieceTokenizer.
 
         Args:
-            **never_split**: (`optional`) list of str
+            never_split (`List[str]`, *optional*):
                 Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                :func:`PreTrainedTokenizer.tokenize`) List of token not to split.
+                [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
         """
         # union() returns a new set by concatenating the two sets.
         never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
@@ -1449,14 +1448,14 @@ class WordpieceTokenizer(object):
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
-          text: A single token or whitespace separated tokens. This should have
-            already been passed through `BasicTokenizer`.
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
 
         Returns:
-          A list of wordpiece tokens.
+            A list of wordpiece tokens.
         """
 
         output_tokens = []
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
index 73a2cc2cb3..cab5df57d8 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2_fast.py
@@ -61,48 +61,48 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
     r"""
-    Construct a "fast" LayoutLMv2 tokenizer (backed by HuggingFace's `tokenizers` library). Based on WordPiece.
+    Construct a "fast" LayoutLMv2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        cls_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
+        cls_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
             The bounding box to use for the special [CLS] token.
-        sep_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[1000, 1000, 1000, 1000]`):
+        sep_token_box (`List[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
             The bounding box to use for the special [SEP] token.
-        pad_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
+        pad_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
             The bounding box to use for the special [PAD] token.
-        pad_token_label (:obj:`int`, `optional`, defaults to -100):
-            The label to use for padding tokens. Defaults to -100, which is the :obj:`ignore_index` of PyTorch's
+        pad_token_label (`int`, *optional*, defaults to -100):
+            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
             CrossEntropyLoss.
-        only_label_first_subword (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        only_label_first_subword (`bool`, *optional*, defaults to `True`):
             Whether or not to only label the first subword, in case word labels are provided.
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this
-            issue <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original LayoutLMv2).
+            value for `lowercase` (as in the original LayoutLMv2).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -196,16 +196,16 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
         sequences with word-level normalized bounding boxes and optional labels.
 
         Args:
-            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text (`str`, `List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
                 (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
                 words).
-            text_pair (:obj:`List[str]`, :obj:`List[List[str]]`):
+            text_pair (`List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
                 (pretokenized string).
-            boxes (:obj:`List[List[int]]`, :obj:`List[List[List[int]]]`):
+            boxes (`List[List[int]]`, `List[List[List[int]]]`):
                 Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
-            word_labels (:obj:`List[int]`, :obj:`List[List[int]]`, `optional`):
+            word_labels (`List[int]`, `List[List[int]]`, *optional*):
                 Word-level integer labels (for token classification tasks such as FUNSD, CORD).
         """
         # Input type checking for clearer error
@@ -407,12 +407,12 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
     ) -> BatchEncoding:
         """
         Tokenize and prepare for the model a sequence or a pair of sequences. .. warning:: This method is deprecated,
-        ``__call__`` should be used instead.
+        `__call__` should be used instead.
 
         Args:
-            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text (`str`, `List[str]`, `List[List[str]]`):
                 The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
-            text_pair (:obj:`List[str]` or :obj:`List[int]`, `optional`):
+            text_pair (`List[str]` or `List[List[str]]`, *optional*):
                 Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
                 list of list of strings (words of a batch of examples).
         """
@@ -760,17 +760,17 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A BERT sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
@@ -785,16 +785,16 @@ class LayoutLMv2TokenizerFast(PreTrainedTokenizerFast):
         """
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence
         pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second
-        sequence | If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        sequence | If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/layoutxlm/processing_layoutxlm.py b/src/transformers/models/layoutxlm/processing_layoutxlm.py
index 7178797fbf..59c7cb0e6c 100644
--- a/src/transformers/models/layoutxlm/processing_layoutxlm.py
+++ b/src/transformers/models/layoutxlm/processing_layoutxlm.py
@@ -30,21 +30,21 @@ class LayoutXLMProcessor:
     Constructs a LayoutXLM processor which combines a LayoutXLM feature extractor and a LayoutXLM tokenizer into a
     single processor.
 
-    :class:`~transformers.LayoutXLMProcessor` offers all the functionalities you need to prepare data for the model.
+    [`LayoutXLMProcessor`] offers all the functionalities you need to prepare data for the model.
 
-    It first uses :class:`~transformers.LayoutLMv2FeatureExtractor` to resize document images to a fixed size, and
+    It first uses [`LayoutLMv2FeatureExtractor`] to resize document images to a fixed size, and
     optionally applies OCR to get words and normalized bounding boxes. These are then provided to
-    :class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`, which turns the words
-    and bounding boxes into token-level :obj:`input_ids`, :obj:`attention_mask`, :obj:`token_type_ids`, :obj:`bbox`.
-    Optionally, one can provide integer :obj:`word_labels`, which are turned into token-level :obj:`labels` for token
+    [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`], which turns the words
+    and bounding boxes into token-level `input_ids`, `attention_mask`, `token_type_ids`, `bbox`.
+    Optionally, one can provide integer `word_labels`, which are turned into token-level `labels` for token
     classification tasks (such as FUNSD, CORD).
 
     Args:
-        feature_extractor (:obj:`LayoutLMv2FeatureExtractor`):
-            An instance of :class:`~transformers.LayoutLMv2FeatureExtractor`. The feature extractor is a required
+        feature_extractor (`LayoutLMv2FeatureExtractor`):
+            An instance of [`LayoutLMv2FeatureExtractor`]. The feature extractor is a required
             input.
-        tokenizer (:obj:`LayoutXLMTokenizer` or :obj:`LayoutXLMTokenizerFast`):
-            An instance of :class:`~transformers.LayoutXLMTokenizer` or :class:`~transformers.LayoutXLMTokenizerFast`.
+        tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`):
+            An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`].
             The tokenizer is a required input.
     """
 
@@ -63,18 +63,20 @@ class LayoutXLMProcessor:
 
     def save_pretrained(self, save_directory):
         """
-        Save a LayoutXLM feature_extractor object and LayoutXLM tokenizer object to the directory ``save_directory``,
-        so that it can be re-loaded using the :func:`~transformers.LayoutXLMProcessor.from_pretrained` class method.
+        Save a LayoutXLM feature_extractor object and LayoutXLM tokenizer object to the directory `save_directory`,
+        so that it can be re-loaded using the [`~LayoutXLMProcessor.from_pretrained`] class method.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling
-            :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` and
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
-            docstrings of the methods above for more information.
+        This class method is simply calling
+        [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
+        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
+        docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            save_directory (:obj:`str` or :obj:`os.PathLike`):
+            save_directory (`str` or `os.PathLike`):
                 Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                 be created if it does not exist).
         """
@@ -85,34 +87,36 @@ class LayoutXLMProcessor:
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, use_fast=True, **kwargs):
         r"""
-        Instantiate a :class:`~transformers.LayoutXLMProcessor` from a pretrained LayoutXLM processor.
+        Instantiate a [`LayoutXLMProcessor`] from a pretrained LayoutXLM processor.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling Layoutv2FeatureExtractor's
-            :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained` and
-            LayoutXLMTokenizerFast's :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`.
-            Please refer to the docstrings of the methods above for more information.
+        This class method is simply calling LayoutLMv2FeatureExtractor's
+        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
+        LayoutXLMTokenizerFast's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
+        Please refer to the docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 This can be either:
 
-                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a feature extractor file saved using the
-                  :meth:`~transformers.SequenceFeatureExtractor.save_pretrained` method, e.g.,
-                  ``./my_model_directory/``.
-                - a path or url to a saved feature extractor JSON `file`, e.g.,
-                  ``./my_model_directory/preprocessor_config.json``.
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g.,
+                  `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
 
-            use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            use_fast (`bool`, *optional*, defaults to `True`):
                 Whether or not to instantiate a fast tokenizer.
 
             **kwargs
-                Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
-                :class:`~transformers.PreTrainedTokenizer`
+                Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
+                [`PreTrainedTokenizer`]
         """
         feature_extractor = LayoutLMv2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
         if use_fast:
@@ -146,14 +150,12 @@ class LayoutXLMProcessor:
         **kwargs
     ) -> BatchEncoding:
         """
-        This method first forwards the :obj:`images` argument to
-        :meth:`~transformers.LayoutLMv2FeatureExtractor.__call__`. In case :class:`~LayoutLMv2FeatureExtractor` was
-        initialized with :obj:`apply_ocr` set to ``True``, it passes the obtained words and bounding boxes along with
-        the additional arguments to :meth:`~transformers.LayoutXLMTokenizer.__call__` and returns the output, together
-        with resized :obj:`images`. In case :class:`~LayoutLMv2FeatureExtractor` was initialized with :obj:`apply_ocr`
-        set to ``False``, it passes the words (:obj:`text`/:obj:`text_pair`) and :obj:`boxes` specified by the user
-        along with the additional arguments to :meth:`~transformers.LayoutXLMTokenizer.__call__` and returns the
-        output, together with resized :obj:`images`.
+        This method first forwards the `images` argument to
+        [`~LayoutLMv2FeatureExtractor.__call__`]. In case [`LayoutLMv2FeatureExtractor`] was
+        initialized with `apply_ocr` set to `True`, it passes the obtained words and bounding boxes along with
+        the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, together
+        with resized `images`. In case [`LayoutLMv2FeatureExtractor`] was initialized with `apply_ocr`
+        set to `False`, it passes the words (`text`/`text_pair`) and `boxes` specified by the user along with the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, together with resized `images`.
 
         Please refer to the docstring of the above two methods for more information.
         """
diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
index 0e40cb06fe..75cc9a591c 100644
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
+++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm.py
@@ -47,75 +47,80 @@ logger = logging.get_logger(__name__)
 
 class LayoutXLMTokenizer(PreTrainedTokenizer):
     """
-    Adapted from :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on
-    `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+    [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        cls_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
+        cls_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
             The bounding box to use for the special [CLS] token.
-        sep_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[1000, 1000, 1000, 1000]`):
+        sep_token_box (`List[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
             The bounding box to use for the special [SEP] token.
-        pad_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
+        pad_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
             The bounding box to use for the special [PAD] token.
-        pad_token_label (:obj:`int`, `optional`, defaults to -100):
-            The label to use for padding tokens. Defaults to -100, which is the :obj:`ignore_index` of PyTorch's
+        pad_token_label (`int`, *optional*, defaults to -100):
+            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
             CrossEntropyLoss.
-        only_label_first_subword (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        only_label_first_subword (`bool`, *optional*, defaults to `True`):
             Whether or not to only label the first subword, in case word labels are provided.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
     Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -212,17 +217,17 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An XLM-RoBERTa sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         if token_ids_1 is None:
@@ -236,18 +241,18 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -267,13 +272,13 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
         not make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
 
         """
 
@@ -357,16 +362,16 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
         sequences with word-level normalized bounding boxes and optional labels.
 
         Args:
-            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text (`str`, `List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
                 (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
                 words).
-            text_pair (:obj:`List[str]`, :obj:`List[List[str]]`):
+            text_pair (`List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
                 (pretokenized string).
-            boxes (:obj:`List[List[int]]`, :obj:`List[List[List[int]]]`):
+            boxes (`List[List[int]]`, `List[List[List[int]]]`):
                 Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
-            word_labels (:obj:`List[int]`, :obj:`List[List[int]]`, `optional`):
+            word_labels (`List[int]`, `List[List[int]]`, *optional*):
                 Word-level integer labels (for token classification tasks such as FUNSD, CORD).
         """
         # Input type checking for clearer error
@@ -684,14 +689,14 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
         truncates sequences if overflowing while taking into account the special tokens and manages a moving window
         (with user defined stride) for overflowing tokens.
 
-        Word-level :obj:`boxes` are turned into token-level :obj:`bbox`. If provided, word-level :obj:`word_labels` are
-        turned into token-level :obj:`labels`. The word label is used for the first token of the word, while remaining
+        Word-level `boxes` are turned into token-level `bbox`. If provided, word-level `word_labels` are
+        turned into token-level `labels`. The word label is used for the first token of the word, while remaining
         tokens are labeled with -100, such that they will be ignored by the loss function.
 
         Args:
-            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text (`str`, `List[str]`, `List[List[str]]`):
                 The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
-            text_pair (:obj:`List[str]` or :obj:`List[int]`, `optional`):
+            text_pair (`List[str]` or `List[int]`, *optional*):
                 Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
                 list of list of strings (words of a batch of examples).
         """
@@ -868,41 +873,41 @@ class LayoutXLMTokenizer(PreTrainedTokenizer):
         Truncates a sequence pair in-place following the strategy.
 
         Args:
-            ids (:obj:`List[int]`):
-                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
-                and ``convert_tokens_to_ids`` methods.
-            token_boxes (:obj:`List[List[int]]`):
+            ids (`List[int]`):
+                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize`
+                and `convert_tokens_to_ids` methods.
+            token_boxes (`List[List[int]]`):
                 Bounding boxes of the first sequence.
-            pair_ids (:obj:`List[int]`, `optional`):
-                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
-                and ``convert_tokens_to_ids`` methods.
-            pair_token_boxes (:obj:`List[List[int]]`, `optional`):
+            pair_ids (`List[int]`, *optional*):
+                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
+                and `convert_tokens_to_ids` methods.
+            pair_token_boxes (`List[List[int]]`, *optional*):
                 Bounding boxes of the second sequence.
-            labels (:obj:`List[int]`, `optional`):
+            labels (`List[int]`, *optional*):
                 Labels of the first sequence (for token classification tasks).
-            num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
+            num_tokens_to_remove (`int`, *optional*, defaults to 0):
                 Number of tokens to remove using the truncation strategy.
-            truncation_strategy (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
                 The strategy to follow for truncation. Can be:
 
-                * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
                   to the maximum acceptable input length for the model if that argument is not provided. This will
                   truncate token by token, removing a token from the longest sequence in the pair if a pair of
                   sequences (or a batch of pairs) is provided.
-                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to
                   the maximum acceptable input length for the model if that argument is not provided. This will only
                   truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or
                   to the maximum acceptable input length for the model if that argument is not provided. This will only
                   truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
                   greater than the model maximum admissible input size).
-            stride (:obj:`int`, `optional`, defaults to 0):
+            stride (`int`, *optional*, defaults to 0):
                 If set to a positive number, the overflowing tokens returned will contain some tokens from the main
                 sequence returned. The value of this argument defines the number of additional tokens.
 
         Returns:
-            :obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the
+            `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the
             list of overflowing tokens.
         """
         if num_tokens_to_remove <= 0:
diff --git a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
index 4b9170250f..8c17828de9 100644
--- a/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
+++ b/src/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
@@ -52,57 +52,62 @@ logger = logging.get_logger(__name__)
 
 class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" LayoutXLM tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
-    :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE
-    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
+    Construct a "fast" LayoutXLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+    [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        cls_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
+        cls_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
             The bounding box to use for the special [CLS] token.
-        sep_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[1000, 1000, 1000, 1000]`):
+        sep_token_box (`List[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
             The bounding box to use for the special [SEP] token.
-        pad_token_box (:obj:`List[int]`, `optional`, defaults to :obj:`[0, 0, 0, 0]`):
+        pad_token_box (`List[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
             The bounding box to use for the special [PAD] token.
-        pad_token_label (:obj:`int`, `optional`, defaults to -100):
-            The label to use for padding tokens. Defaults to -100, which is the :obj:`ignore_index` of PyTorch's
+        pad_token_label (`int`, *optional*, defaults to -100):
+            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
             CrossEntropyLoss.
-        only_label_first_subword (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        only_label_first_subword (`bool`, *optional*, defaults to `True`):
             Whether or not to only label the first subword, in case word labels are provided.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
     """
 
@@ -189,16 +194,16 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
         sequences with word-level normalized bounding boxes and optional labels.
 
         Args:
-            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text (`str`, `List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
                 (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
                 words).
-            text_pair (:obj:`List[str]`, :obj:`List[List[str]]`):
+            text_pair (`List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
                 (pretokenized string).
-            boxes (:obj:`List[List[int]]`, :obj:`List[List[List[int]]]`):
+            boxes (`List[List[int]]`, `List[List[List[int]]]`):
                 Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
-            word_labels (:obj:`List[int]`, :obj:`List[List[int]]`, `optional`):
+            word_labels (`List[int]`, `List[List[int]]`, *optional*):
                 Word-level integer labels (for token classification tasks such as FUNSD, CORD).
         """
         # Input type checking for clearer error
@@ -630,17 +635,17 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An XLM-RoBERTa sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         if token_ids_1 is None:
@@ -657,13 +662,13 @@ class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
         not make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
 
         """
 
diff --git a/src/transformers/models/led/configuration_led.py b/src/transformers/models/led/configuration_led.py
index e30c3e04c4..e6b617cc24 100644
--- a/src/transformers/models/led/configuration_led.py
+++ b/src/transformers/models/led/configuration_led.py
@@ -30,60 +30,63 @@ LED_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class LEDConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.LEDModel`. It is used to
+    This is the configuration class to store the configuration of a [`LEDModel`]. It is used to
     instantiate an LED model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the LED `allenai/led-base-16384
-    <https://huggingface.co/allenai/led-base-16384>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the LED [allenai/led-base-16384](https://huggingface.co/allenai/led-base-16384) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 50265):
             Vocabulary size of the LED model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.LEDModel` or :class:`~transformers.TFLEDModel`.
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            `input_ids` passed when calling [`LEDModel`] or [`TFLEDModel`].
+        d_model (`int`, *optional*, defaults to 1024):
             Dimensionality of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 12):
+        encoder_layers (`int`, *optional*, defaults to 12):
             Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+        decoder_layers (`int`, *optional*, defaults to 12):
             Number of decoder layers.
-        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for classifier.
-        max_encoder_position_embeddings (:obj:`int`, `optional`, defaults to 16384):
+        max_encoder_position_embeddings (`int`, *optional*, defaults to 16384):
             The maximum sequence length that the encoder might ever be used with.
-        max_decoder_position_embeddings (:obj:`int`, `optional`, defaults to 16384):
+        max_decoder_position_embeddings (`int`, *optional*, defaults to 16384):
             The maximum sequence length that the decoder might ever be used with.
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the
+            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the
+            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models)
 
-        Example::
+    Example:
+
+    ```python
+
+    ```
 
         >>> from transformers import LEDModel, LEDConfig
 
diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py
index b0fd2e4ed7..1610b67ada 100755
--- a/src/transformers/models/led/modeling_led.py
+++ b/src/transformers/models/led/modeling_led.py
@@ -340,18 +340,21 @@ class LEDEncoderSelfAttention(nn.Module):
         """
         shift every row 1 step right, converting columns into diagonals.
 
-        Example::
+        Example:
 
-              chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
-                                       -1.8348,  0.7672,  0.2986,  0.0285,
-                                       -0.7584,  0.4206, -0.0405,  0.1599,
-                                       2.0514, -1.1600,  0.5372,  0.2629 ]
-              window_overlap = num_rows = 4
-             (pad & diagonalize) =>
-             [ 0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000
-               0.0000,  -1.8348,  0.7672,  0.2986,  0.0285, 0.0000,  0.0000
-               0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599, 0.0000
-               0.0000,  0.0000,  0.0000, 2.0514, -1.1600,  0.5372,  0.2629 ]
+        ```python
+        chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
+                                 -1.8348,  0.7672,  0.2986,  0.0285,
+                                 -0.7584,  0.4206, -0.0405,  0.1599,
+                                 2.0514, -1.1600,  0.5372,  0.2629 ]
+        window_overlap = num_rows = 4
+
+        (pad & diagonalize) =>
+        [ 0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000
+          0.0000,  -1.8348,  0.7672,  0.2986,  0.0285, 0.0000,  0.0000
+          0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599, 0.0000
+          0.0000,  0.0000,  0.0000, 2.0514, -1.1600,  0.5372,  0.2629 ]
+        ```
         """
         total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size()
         chunked_hidden_states = nn.functional.pad(
diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py
index b12c1d0786..a0d5dc503d 100644
--- a/src/transformers/models/led/modeling_tf_led.py
+++ b/src/transformers/models/led/modeling_tf_led.py
@@ -607,18 +607,21 @@ class TFLEDEncoderSelfAttention(tf.keras.layers.Layer):
         """
         shift every row 1 step right, converting columns into diagonals.
 
-        Example::
+        Example:
 
-              chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
-                                       -1.8348,  0.7672,  0.2986,  0.0285,
-                                       -0.7584,  0.4206, -0.0405,  0.1599,
-                                       2.0514, -1.1600,  0.5372,  0.2629 ]
-              window_overlap = num_rows = 4
-             (pad & diagonalize) =>
-             [ 0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000
-               0.0000,  -1.8348,  0.7672,  0.2986,  0.0285, 0.0000,  0.0000
-               0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599, 0.0000
-               0.0000,  0.0000,  0.0000, 2.0514, -1.1600,  0.5372,  0.2629 ]
+        ```python
+        chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
+                                 -1.8348,  0.7672,  0.2986,  0.0285,
+                                 -0.7584,  0.4206, -0.0405,  0.1599,
+                                 2.0514, -1.1600,  0.5372,  0.2629 ]
+        window_overlap = num_rows = 4
+
+        (pad & diagonalize) =>
+        [ 0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000
+          0.0000,  -1.8348,  0.7672,  0.2986,  0.0285, 0.0000,  0.0000
+          0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599, 0.0000
+          0.0000,  0.0000,  0.0000, 2.0514, -1.1600,  0.5372,  0.2629 ]
+        ```
         """
         total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states)
         paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 0], [0, window_overlap + 1]])
@@ -2368,19 +2371,20 @@ class TFLEDForConditionalGeneration(TFLEDPreTrainedModel):
         """
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import LEDTokenizer, TFLEDForConditionalGeneration
-            >>> import tensorflow as tf
-            >>> mname = 'allenai/led-base-16384'
-            >>> tokenizer = LEDTokenizer.from_pretrained(mname)
-            >>> TXT = "My friends are <mask> but they eat too many carbs."
-            >>> model = TFLEDForConditionalGeneration.from_pretrained(mname)
-            >>> batch = tokenizer([TXT], return_tensors='tf')
-            >>> logits = model(inputs=batch.input_ids).logits
-            >>> probs = tf.nn.softmax(logits[0])
-            >>> # probs[5] is associated with the mask token
-        """
+        ```python
+        >>> from transformers import LEDTokenizer, TFLEDForConditionalGeneration
+        >>> import tensorflow as tf
+        >>> mname = 'allenai/led-base-16384'
+        >>> tokenizer = LEDTokenizer.from_pretrained(mname)
+        >>> TXT = "My friends are <mask> but they eat too many carbs."
+        >>> model = TFLEDForConditionalGeneration.from_pretrained(mname)
+        >>> batch = tokenizer([TXT], return_tensors='tf')
+        >>> logits = model(inputs=batch.input_ids).logits
+        >>> probs = tf.nn.softmax(logits[0])
+        >>> # probs[5] is associated with the mask token
+        ```"""
 
         inputs = input_processing(
             func=self.call,
diff --git a/src/transformers/models/led/tokenization_led.py b/src/transformers/models/led/tokenization_led.py
index 3facfaa515..eca8098817 100644
--- a/src/transformers/models/led/tokenization_led.py
+++ b/src/transformers/models/led/tokenization_led.py
@@ -40,10 +40,10 @@ class LEDTokenizer(BartTokenizer):
     """
     Construct a LED tokenizer.
 
-    :class:`~transformers.LEDTokenizer` is identical to :class:`~transformers.BartTokenizer` and runs end-to-end
+    [`LEDTokenizer`] is identical to [`BartTokenizer`] and runs end-to-end
     tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BartTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BartTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/led/tokenization_led_fast.py b/src/transformers/models/led/tokenization_led_fast.py
index a6b681c4df..b815c806f3 100644
--- a/src/transformers/models/led/tokenization_led_fast.py
+++ b/src/transformers/models/led/tokenization_led_fast.py
@@ -39,12 +39,12 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class LEDTokenizerFast(BartTokenizerFast):
     r"""
-    Construct a "fast" LED tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" LED tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.LEDTokenizerFast` is identical to :class:`~transformers.BartTokenizerFast` and runs
+    [`LEDTokenizerFast`] is identical to [`BartTokenizerFast`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BartTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BartTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/longformer/configuration_longformer.py b/src/transformers/models/longformer/configuration_longformer.py
index 3c72fc2763..f10f4a4cd3 100644
--- a/src/transformers/models/longformer/configuration_longformer.py
+++ b/src/transformers/models/longformer/configuration_longformer.py
@@ -34,37 +34,37 @@ LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class LongformerConfig(RobertaConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel` or a
-    :class:`~transformers.TFLongformerModel`. It is used to instantiate a Longformer model according to the specified
+    This is the configuration class to store the configuration of a [`LongformerModel`] or a
+    [`TFLongformerModel`]. It is used to instantiate a Longformer model according to the specified
     arguments, defining the model architecture.
 
-    This is the configuration class to store the configuration of a :class:`~transformers.LongformerModel`. It is used
+    This is the configuration class to store the configuration of a [`LongformerModel`]. It is used
     to instantiate an Longformer model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the RoBERTa
-    `roberta-base <https://huggingface.co/roberta-base>`__ architecture with a sequence length 4,096.
+    [roberta-base](https://huggingface.co/roberta-base) architecture with a sequence length 4,096.
 
-    The :class:`~transformers.LongformerConfig` class directly inherits :class:`~transformers.RobertaConfig`. It reuses
+    The [`LongformerConfig`] class directly inherits [`RobertaConfig`]. It reuses
     the same defaults. Please check the parent class for more information.
 
     Args:
-        attention_window (:obj:`int` or :obj:`List[int]`, `optional`, defaults to 512):
-            Size of an attention window around each token. If an :obj:`int`, use the same size for all layers. To
-            specify a different window size for each layer, use a :obj:`List[int]` where ``len(attention_window) ==
-            num_hidden_layers``.
+        attention_window (`int` or `List[int]`, *optional*, defaults to 512):
+            Size of an attention window around each token. If an `int`, use the same size for all layers. To
+            specify a different window size for each layer, use a `List[int]` where `len(attention_window) == num_hidden_layers`.
 
-    Example::
+    Example:
 
-        >>> from transformers import LongformerConfig, LongformerModel
+    ```python
+    >>> from transformers import LongformerConfig, LongformerModel
 
-        >>> # Initializing a Longformer configuration
-        >>> configuration = LongformerConfig()
+    >>> # Initializing a Longformer configuration
+    >>> configuration = LongformerConfig()
 
-        >>> # Initializing a model from the configuration
-        >>> model = LongformerModel(configuration)
+    >>> # Initializing a model from the configuration
+    >>> model = LongformerModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "longformer"
 
     def __init__(self, attention_window: Union[List[int], int] = 512, sep_token_id: int = 2, **kwargs):
diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py
index 6524f42d80..056a85450d 100755
--- a/src/transformers/models/longformer/modeling_longformer.py
+++ b/src/transformers/models/longformer/modeling_longformer.py
@@ -709,18 +709,21 @@ class LongformerSelfAttention(nn.Module):
         """
         shift every row 1 step right, converting columns into diagonals.
 
-        Example::
+        Example:
 
-              chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
-                                       -1.8348,  0.7672,  0.2986,  0.0285,
-                                       -0.7584,  0.4206, -0.0405,  0.1599,
-                                       2.0514, -1.1600,  0.5372,  0.2629 ]
-              window_overlap = num_rows = 4
-             (pad & diagonalize) =>
-             [ 0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000
-               0.0000,  -1.8348,  0.7672,  0.2986,  0.0285, 0.0000,  0.0000
-               0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599, 0.0000
-               0.0000,  0.0000,  0.0000, 2.0514, -1.1600,  0.5372,  0.2629 ]
+        ```python
+        chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
+                                 -1.8348,  0.7672,  0.2986,  0.0285,
+                                 -0.7584,  0.4206, -0.0405,  0.1599,
+                                 2.0514, -1.1600,  0.5372,  0.2629 ]
+        window_overlap = num_rows = 4
+
+        (pad & diagonalize) =>
+        [ 0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000
+          0.0000,  -1.8348,  0.7672,  0.2986,  0.0285, 0.0000,  0.0000
+          0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599, 0.0000
+          0.0000,  0.0000,  0.0000, 2.0514, -1.1600,  0.5372,  0.2629 ]
+        ```
         """
         total_num_heads, num_chunks, window_overlap, hidden_dim = chunked_hidden_states.size()
         chunked_hidden_states = nn.functional.pad(
@@ -1584,28 +1587,29 @@ class LongformerModel(LongformerPreTrainedModel):
 
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> import torch
-            >>> from transformers import LongformerModel, LongformerTokenizer
+        ```python
+        >>> import torch
+        >>> from transformers import LongformerModel, LongformerTokenizer
 
-            >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
-            >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
+        >>> model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
+        >>> tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
 
-            >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
-            >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
+        >>> SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
+        >>> input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(0)  # batch of size 1
 
-            >>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
-            >>> global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to global attention to be deactivated for all tokens
-            >>> global_attention_mask[:, [1, 4, 21,]] = 1  # Set global attention to random tokens for the sake of this example
-            ...                                     # Usually, set global attention based on the task. For example,
-            ...                                     # classification: the <s> token
-            ...                                     # QA: question tokens
-            ...                                     # LM: potentially on the beginning of sentences and paragraphs
-            >>> outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
-            >>> sequence_output = outputs.last_hidden_state
-            >>> pooled_output = outputs.pooler_output
-        """
+        >>> attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to local attention
+        >>> global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device) # initialize to global attention to be deactivated for all tokens
+        >>> global_attention_mask[:, [1, 4, 21,]] = 1  # Set global attention to random tokens for the sake of this example
+        ...                                     # Usually, set global attention based on the task. For example,
+        ...                                     # classification: the <s> token
+        ...                                     # QA: question tokens
+        ...                                     # LM: potentially on the beginning of sentences and paragraphs
+        >>> outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)
+        >>> sequence_output = outputs.last_hidden_state
+        >>> pooled_output = outputs.pooler_output
+        ```"""
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py
index 19d354248b..0137d545da 100644
--- a/src/transformers/models/longformer/modeling_tf_longformer.py
+++ b/src/transformers/models/longformer/modeling_tf_longformer.py
@@ -1121,18 +1121,21 @@ class TFLongformerSelfAttention(tf.keras.layers.Layer):
         """
         shift every row 1 step right, converting columns into diagonals.
 
-        Example::
+        Example:
 
-              chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
-                                       -1.8348,  0.7672,  0.2986,  0.0285,
-                                       -0.7584,  0.4206, -0.0405,  0.1599,
-                                       2.0514, -1.1600,  0.5372,  0.2629 ]
-              window_overlap = num_rows = 4
-             (pad & diagonalize) =>
-             [ 0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000
-               0.0000,  -1.8348,  0.7672,  0.2986,  0.0285, 0.0000,  0.0000
-               0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599, 0.0000
-               0.0000,  0.0000,  0.0000, 2.0514, -1.1600,  0.5372,  0.2629 ]
+        ```python
+        chunked_hidden_states: [ 0.4983,  2.6918, -0.0071,  1.0492,
+                                 -1.8348,  0.7672,  0.2986,  0.0285,
+                                 -0.7584,  0.4206, -0.0405,  0.1599,
+                                 2.0514, -1.1600,  0.5372,  0.2629 ]
+        window_overlap = num_rows = 4
+
+        (pad & diagonalize) =>
+        [ 0.4983,  2.6918, -0.0071,  1.0492, 0.0000,  0.0000,  0.0000
+          0.0000,  -1.8348,  0.7672,  0.2986,  0.0285, 0.0000,  0.0000
+          0.0000,  0.0000, -0.7584,  0.4206, -0.0405,  0.1599, 0.0000
+          0.0000,  0.0000,  0.0000, 2.0514, -1.1600,  0.5372,  0.2629 ]
+        ```
         """
         total_num_heads, num_chunks, window_overlap, hidden_dim = shape_list(chunked_hidden_states)
         paddings = tf.convert_to_tensor([[0, 0], [0, 0], [0, 0], [0, window_overlap + 1]])
diff --git a/src/transformers/models/longformer/tokenization_longformer.py b/src/transformers/models/longformer/tokenization_longformer.py
index d841b4147c..bba0cbd7c5 100644
--- a/src/transformers/models/longformer/tokenization_longformer.py
+++ b/src/transformers/models/longformer/tokenization_longformer.py
@@ -52,7 +52,7 @@ class LongformerTokenizer(RobertaTokenizer):
     r"""
     Construct a Longformer tokenizer.
 
-    :class:`~transformers.LongformerTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to the
+    [`LongformerTokenizer`] is identical to [`RobertaTokenizer`]. Refer to the
     superclass for usage examples and documentation concerning parameters.
     """
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/longformer/tokenization_longformer_fast.py b/src/transformers/models/longformer/tokenization_longformer_fast.py
index a42346fcd7..145e90b544 100644
--- a/src/transformers/models/longformer/tokenization_longformer_fast.py
+++ b/src/transformers/models/longformer/tokenization_longformer_fast.py
@@ -58,9 +58,9 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class LongformerTokenizerFast(RobertaTokenizerFast):
     r"""
-    Construct a "fast" Longformer tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" Longformer tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.LongformerTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer
+    [`LongformerTokenizerFast`] is identical to [`RobertaTokenizerFast`]. Refer
     to the superclass for usage examples and documentation concerning parameters.
     """
     # merges and vocab same as Roberta
diff --git a/src/transformers/models/luke/configuration_luke.py b/src/transformers/models/luke/configuration_luke.py
index ba6dc49643..0c39057042 100644
--- a/src/transformers/models/luke/configuration_luke.py
+++ b/src/transformers/models/luke/configuration_luke.py
@@ -28,64 +28,64 @@ LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class LukeConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.LukeModel`. It is used to
+    This is the configuration class to store the configuration of a [`LukeModel`]. It is used to
     instantiate a LUKE model according to the specified arguments, defining the model architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the LUKE model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.LukeModel`.
-        entity_vocab_size (:obj:`int`, `optional`, defaults to 500000):
+            `input_ids` passed when calling [`LukeModel`].
+        entity_vocab_size (`int`, *optional*, defaults to 500000):
             Entity vocabulary size of the LUKE model. Defines the number of different entities that can be represented
-            by the :obj:`entity_ids` passed when calling :class:`~transformers.LukeModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            by the `entity_ids` passed when calling [`LukeModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        entity_emb_size (:obj:`int`, `optional`, defaults to 256):
+        entity_emb_size (`int`, *optional*, defaults to 256):
             The number of dimensions of the entity embedding.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.LukeModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`LukeModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        use_entity_aware_attention (:obj:`bool`, defaults to :obj:`True`):
-            Whether or not the model should use the entity-aware self-attention mechanism proposed in `LUKE: Deep
-            Contextualized Entity Representations with Entity-aware Self-attention (Yamada et al.)
-            <https://arxiv.org/abs/2010.01057>`__.
+        use_entity_aware_attention (`bool`, defaults to `True`):
+            Whether or not the model should use the entity-aware self-attention mechanism proposed in [LUKE: Deep
+            Contextualized Entity Representations with Entity-aware Self-attention (Yamada et al.)](https://arxiv.org/abs/2010.01057).
 
-    Examples::
+    Examples:
 
-        >>> from transformers import LukeConfig, LukeModel
+    ```python
+    >>> from transformers import LukeConfig, LukeModel
 
-        >>> # Initializing a LUKE configuration
-        >>> configuration = LukeConfig()
+    >>> # Initializing a LUKE configuration
+    >>> configuration = LukeConfig()
 
-        >>> # Initializing a model from the configuration
-        >>> model = LukeModel(configuration)
+    >>> # Initializing a model from the configuration
+    >>> model = LukeModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "luke"
 
     def __init__(
diff --git a/src/transformers/models/luke/modeling_luke.py b/src/transformers/models/luke/modeling_luke.py
index 468093f24a..1a73f73a38 100644
--- a/src/transformers/models/luke/modeling_luke.py
+++ b/src/transformers/models/luke/modeling_luke.py
@@ -924,32 +924,33 @@ class LukeModel(LukePreTrainedModel):
 
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import LukeTokenizer, LukeModel
+        ```python
+        >>> from transformers import LukeTokenizer, LukeModel
 
-            >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
-            >>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
+        >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
+        >>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
 
-            # Compute the contextualized entity representation corresponding to the entity mention "Beyoncé"
-            >>> text = "Beyoncé lives in Los Angeles."
-            >>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"
+        >>> # Compute the contextualized entity representation corresponding to the entity mention "Beyoncé"
+        >>> text = "Beyoncé lives in Los Angeles."
+        >>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"
 
-            >>> encoding = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
-            >>> outputs = model(**encoding)
-            >>> word_last_hidden_state = outputs.last_hidden_state
-            >>> entity_last_hidden_state = outputs.entity_last_hidden_state
+        >>> encoding = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
+        >>> outputs = model(**encoding)
+        >>> word_last_hidden_state = outputs.last_hidden_state
+        >>> entity_last_hidden_state = outputs.entity_last_hidden_state
 
-            # Input Wikipedia entities to obtain enriched contextualized representations of word tokens
-            >>> text = "Beyoncé lives in Los Angeles."
-            >>> entities = ["Beyoncé", "Los Angeles"]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
-            >>> entity_spans = [(0, 7), (17, 28)]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
+        >>> # Input Wikipedia entities to obtain enriched contextualized representations of word tokens
+        >>> text = "Beyoncé lives in Los Angeles."
+        >>> entities = ["Beyoncé", "Los Angeles"]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
+        >>> entity_spans = [(0, 7), (17, 28)]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
 
-            >>> encoding = tokenizer(text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
-            >>> outputs = model(**encoding)
-            >>> word_last_hidden_state = outputs.last_hidden_state
-            >>> entity_last_hidden_state = outputs.entity_last_hidden_state
-        """
+        >>> encoding = tokenizer(text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
+        >>> outputs = model(**encoding)
+        >>> word_last_hidden_state = outputs.last_hidden_state
+        >>> entity_last_hidden_state = outputs.entity_last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py
index 785fdf2233..374819a130 100644
--- a/src/transformers/models/luke/tokenization_luke.py
+++ b/src/transformers/models/luke/tokenization_luke.py
@@ -74,79 +74,79 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 }
 
 ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
-            return_token_type_ids (:obj:`bool`, `optional`):
+            return_token_type_ids (`bool`, *optional*):
                 Whether to return token type IDs. If left to the default, will return the token type IDs according to
-                the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+                the specific tokenizer's default, defined by the `return_outputs` attribute.
 
-                `What are token type IDs? <../glossary.html#token-type-ids>`__
-            return_attention_mask (:obj:`bool`, `optional`):
+                [What are token type IDs?](../glossary#token-type-ids)
+            return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
-                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+                to the specific tokenizer's default, defined by the `return_outputs` attribute.
 
-                `What are attention masks? <../glossary.html#attention-mask>`__
-            return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                [What are attention masks?](../glossary#attention-mask)
+            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
-                of pairs) is provided with :obj:`truncation_strategy = longest_first` or :obj:`True`, an error is
+                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is
                 raised instead of returning overflowing tokens.
-            return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
                 Whether or not to return special tokens mask information.
-            return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not to return :obj:`(char_start, char_end)` for each token.
+            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
+                Whether or not to return `(char_start, char_end)` for each token.
 
                 This is only available on fast tokenizers inheriting from
-                :class:`~transformers.PreTrainedTokenizerFast`, if using Python's tokenizer, this method will raise
-                :obj:`NotImplementedError`.
-            return_length  (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                [`PreTrainedTokenizerFast`], if using Python's tokenizer, this method will raise
+                `NotImplementedError`.
+            return_length (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the lengths of the encoded inputs.
-            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            verbose (`bool`, *optional*, defaults to `True`):
                 Whether or not to print more information and warnings.
-            **kwargs: passed to the :obj:`self.tokenize()` method
+            **kwargs: passed to the `self.tokenize()` method
 
-        Return:
-            :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
+        Return:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
 
             - **input_ids** -- List of token ids to be fed to a model.
 
-              `What are input IDs? <../glossary.html#input-ids>`__
+              [What are input IDs?](../glossary#input-ids)
 
-            - **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True`
-              or if `"token_type_ids"` is in :obj:`self.model_input_names`).
+            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True`
+              or if *"token_type_ids"* is in `self.model_input_names`).
 
-              `What are token type IDs? <../glossary.html#token-type-ids>`__
+              [What are token type IDs?](../glossary#token-type-ids)
 
             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
-              :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
 
-              `What are attention masks? <../glossary.html#attention-mask>`__
+              [What are attention masks?](../glossary#attention-mask)
 
             - **entity_ids** -- List of entity ids to be fed to a model.
 
-              `What are input IDs? <../glossary.html#input-ids>`__
+              [What are input IDs?](../glossary#input-ids)
 
             - **entity_position_ids** -- List of entity positions in the input sequence to be fed to a model.
 
             - **entity_token_type_ids** -- List of entity token type ids to be fed to a model (when
-              :obj:`return_token_type_ids=True` or if `"entity_token_type_ids"` is in :obj:`self.model_input_names`).
+              `return_token_type_ids=True` or if *"entity_token_type_ids"* is in `self.model_input_names`).
 
-              `What are token type IDs? <../glossary.html#token-type-ids>`__
+              [What are token type IDs?](../glossary#token-type-ids)
 
             - **entity_attention_mask** -- List of indices specifying which entities should be attended to by the model
-              (when :obj:`return_attention_mask=True` or if `"entity_attention_mask"` is in
-              :obj:`self.model_input_names`).
+              (when `return_attention_mask=True` or if *"entity_attention_mask"* is in
+              `self.model_input_names`).
 
-              `What are attention masks? <../glossary.html#attention-mask>`__
+              [What are attention masks?](../glossary#attention-mask)
 
             - **entity_start_positions** -- List of the start positions of entities in the word token sequence (when
-              :obj:`task="entity_span_classification"`).
+              `task="entity_span_classification"`).
             - **entity_end_positions** -- List of the end positions of entities in the word token sequence (when
-              :obj:`task="entity_span_classification"`).
-            - **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and
-              :obj:`return_overflowing_tokens=True`).
-            - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
-              :obj:`return_overflowing_tokens=True`).
+              `task="entity_span_classification"`).
+            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
             - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
-              regular sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`).
-            - **length** -- The length of the inputs (when :obj:`return_length=True`)
+              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
+            - **length** -- The length of the inputs (when `return_length=True`)
 
 """
 
@@ -155,33 +155,33 @@ class LukeTokenizer(RobertaTokenizer):
     r"""
     Construct a LUKE tokenizer.
 
-    This tokenizer inherits from :class:`~transformers.RobertaTokenizer` which contains most of the main methods. Users
+    This tokenizer inherits from [`RobertaTokenizer`] which contains most of the main methods. Users
     should refer to this superclass for more information regarding those methods. Compared to
-    :class:`~transformers.RobertaTokenizer`, :class:`~transformers.LukeTokenizer` also creates entity sequences, namely
-    :obj:`entity_ids`, :obj:`entity_attention_mask`, :obj:`entity_token_type_ids`, and :obj:`entity_position_ids` to be
+    [`RobertaTokenizer`], [`LukeTokenizer`] also creates entity sequences, namely
+    `entity_ids`, `entity_attention_mask`, `entity_token_type_ids`, and `entity_position_ids` to be
     used by the LUKE model.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Path to the merges file.
-        entity_vocab_file (:obj:`str`):
+        entity_vocab_file (`str`):
             Path to the entity vocabulary file.
-        task (:obj:`str`, `optional`):
-            Task for which you want to prepare sequences. One of :obj:`"entity_classification"`,
-            :obj:`"entity_pair_classification"`, or :obj:`"entity_span_classification"`. If you specify this argument,
+        task (`str`, *optional*):
+            Task for which you want to prepare sequences. One of `"entity_classification"`,
+            `"entity_pair_classification"`, or `"entity_span_classification"`. If you specify this argument,
             the entity sequence is automatically created based on the given entity span(s).
-        max_entity_length (:obj:`int`, `optional`, defaults to 32):
-            The maximum length of :obj:`entity_ids`.
-        max_mention_length (:obj:`int`, `optional`, defaults to 30):
+        max_entity_length (`int`, *optional*, defaults to 32):
+            The maximum length of `entity_ids`.
+        max_mention_length (`int`, *optional*, defaults to 30):
             The maximum number of tokens inside an entity span.
-        entity_token_1 (:obj:`str`, `optional`, defaults to :obj:`<ent>`):
+        entity_token_1 (`str`, *optional*, defaults to `<ent>`):
             The special token used to represent an entity span in a word token sequence. This token is only used when
-            ``task`` is set to :obj:`"entity_classification"` or :obj:`"entity_pair_classification"`.
-        entity_token_2 (:obj:`str`, `optional`, defaults to :obj:`<ent2>`):
+            `task` is set to `"entity_classification"` or `"entity_pair_classification"`.
+        entity_token_2 (`str`, *optional*, defaults to `<ent2>`):
             The special token used to represent an entity span in a word token sequence. This token is only used when
-            ``task`` is set to :obj:`"entity_pair_classification"`.
+            `task` is set to `"entity_pair_classification"`.
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -275,39 +275,39 @@ class LukeTokenizer(RobertaTokenizer):
         sequences, depending on the task you want to prepare them for.
 
         Args:
-            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text (`str`, `List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence must be a string. Note that this
                 tokenizer does not support tokenization based on pretokenized strings.
-            text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text_pair (`str`, `List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence must be a string. Note that this
                 tokenizer does not support tokenization based on pretokenized strings.
-            entity_spans (:obj:`List[Tuple[int, int]]`, :obj:`List[List[Tuple[int, int]]]`, `optional`):
+            entity_spans (`List[Tuple[int, int]]`, `List[List[Tuple[int, int]]]`, *optional*):
                 The sequence or batch of sequences of entity spans to be encoded. Each sequence consists of tuples each
                 with two integers denoting character-based start and end positions of entities. If you specify
-                :obj:`"entity_classification"` or :obj:`"entity_pair_classification"` as the ``task`` argument in the
-                constructor, the length of each sequence must be 1 or 2, respectively. If you specify ``entities``, the
-                length of each sequence must be equal to the length of each sequence of ``entities``.
-            entity_spans_pair (:obj:`List[Tuple[int, int]]`, :obj:`List[List[Tuple[int, int]]]`, `optional`):
+                `"entity_classification"` or `"entity_pair_classification"` as the `task` argument in the
+                constructor, the length of each sequence must be 1 or 2, respectively. If you specify `entities`, the
+                length of each sequence must be equal to the length of each sequence of `entities`.
+            entity_spans_pair (`List[Tuple[int, int]]`, `List[List[Tuple[int, int]]]`, *optional*):
                 The sequence or batch of sequences of entity spans to be encoded. Each sequence consists of tuples each
                 with two integers denoting character-based start and end positions of entities. If you specify the
-                ``task`` argument in the constructor, this argument is ignored. If you specify ``entities_pair``, the
-                length of each sequence must be equal to the length of each sequence of ``entities_pair``.
-            entities (:obj:`List[str]`, :obj:`List[List[str]]`, `optional`):
+                `task` argument in the constructor, this argument is ignored. If you specify `entities_pair`, the
+                length of each sequence must be equal to the length of each sequence of `entities_pair`.
+            entities (`List[str]`, `List[List[str]]`, *optional*):
                 The sequence or batch of sequences of entities to be encoded. Each sequence consists of strings
                 representing entities, i.e., special entities (e.g., [MASK]) or entity titles of Wikipedia (e.g., Los
-                Angeles). This argument is ignored if you specify the ``task`` argument in the constructor. The length
-                of each sequence must be equal to the length of each sequence of ``entity_spans``. If you specify
-                ``entity_spans`` without specifying this argument, the entity sequence or the batch of entity sequences
+                Angeles). This argument is ignored if you specify the `task` argument in the constructor. The length
+                of each sequence must be equal to the length of each sequence of `entity_spans`. If you specify
+                `entity_spans` without specifying this argument, the entity sequence or the batch of entity sequences
                 is automatically constructed by filling it with the [MASK] entity.
-            entities_pair (:obj:`List[str]`, :obj:`List[List[str]]`, `optional`):
+            entities_pair (`List[str]`, `List[List[str]]`, *optional*):
                 The sequence or batch of sequences of entities to be encoded. Each sequence consists of strings
                 representing entities, i.e., special entities (e.g., [MASK]) or entity titles of Wikipedia (e.g., Los
-                Angeles). This argument is ignored if you specify the ``task`` argument in the constructor. The length
-                of each sequence must be equal to the length of each sequence of ``entity_spans_pair``. If you specify
-                ``entity_spans_pair`` without specifying this argument, the entity sequence or the batch of entity
+                Angeles). This argument is ignored if you specify the `task` argument in the constructor. The length
+                of each sequence must be equal to the length of each sequence of `entity_spans_pair`. If you specify
+                `entity_spans_pair` without specifying this argument, the entity sequence or the batch of entity
                 sequences is automatically constructed by filling it with the [MASK] entity.
-            max_entity_length (:obj:`int`, `optional`):
-                The maximum length of :obj:`entity_ids`.
+            max_entity_length (`int`, *optional*):
+                The maximum length of `entity_ids`.
         """
         # Input type checking for clearer error
         is_valid_single_text = isinstance(text, str)
@@ -865,24 +865,24 @@ class LukeTokenizer(RobertaTokenizer):
         Prepares a sequence of input id, entity id and entity span, or a pair of sequences of inputs ids, entity ids,
         entity spans so that it can be used by the model. It adds special tokens, truncates sequences if overflowing
         while taking into account the special tokens and manages a moving window (with user defined stride) for
-        overflowing tokens. Please Note, for `pair_ids` different than `None` and `truncation_strategy = longest_first`
-        or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an
+        overflowing tokens. Please note, for `pair_ids` different than `None` and `truncation_strategy = longest_first`
+        or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an
         error.
 
         Args:
-            ids (:obj:`List[int]`):
+            ids (`List[int]`):
                 Tokenized input ids of the first sequence.
-            pair_ids (:obj:`List[int]`, `optional`):
+            pair_ids (`List[int]`, *optional*):
                 Tokenized input ids of the second sequence.
-            entity_ids (:obj:`List[int]`, `optional`):
+            entity_ids (`List[int]`, *optional*):
                 Entity ids of the first sequence.
-            pair_entity_ids (:obj:`List[int]`, `optional`):
+            pair_entity_ids (`List[int]`, *optional*):
                 Entity ids of the second sequence.
-            entity_token_spans (:obj:`List[Tuple[int, int]]`, `optional`):
+            entity_token_spans (`List[Tuple[int, int]]`, *optional*):
                 Entity spans of the first sequence.
-            pair_entity_token_spans (:obj:`List[Tuple[int, int]]`, `optional`):
+            pair_entity_token_spans (`List[Tuple[int, int]]`, *optional*):
                 Entity spans of the second sequence.
-            max_entity_length (:obj:`int`, `optional`):
+            max_entity_length (`int`, *optional*):
                 The maximum length of the entity sequence.
         """
 
@@ -1083,46 +1083,45 @@ class LukeTokenizer(RobertaTokenizer):
         """
         Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
         in the batch. Padding side (left/right) padding token ids are defined at the tokenizer level (with
-        ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``) .. note:: If the
-        ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the result
-        will use the same type unless you provide a different tensor type with ``return_tensors``. In the case of
+        `self.padding_side`, `self.pad_token_id` and `self.pad_token_type_id`). Note: if the
+        `encoded_inputs` passed are a dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the result
+        will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
         PyTorch tensors, you will lose the specific device of your tensors however.
 
         Args:
-            encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
-                Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
-                List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
-                List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
-                well as in a PyTorch Dataloader collate function. Instead of :obj:`List[int]` you can have tensors
+            encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`):
+                Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of tokenized inputs (list of [`BatchEncoding`],
+                `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
+                well as in a PyTorch Dataloader collate function. Instead of `List[int]` you can have tensors
                 (numpy arrays, PyTorch tensors or TensorFlow tensors), see the note above for the return type.
-            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
                  Select a strategy to pad the returned sequences (according to the model's padding side and padding
                  index) among:
 
-                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                   single sequence if provided).
-                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                   maximum acceptable input length for the model if that argument is not provided.
-                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                   different lengths).
-            max_length (:obj:`int`, `optional`):
+            max_length (`int`, *optional*):
                 Maximum length of the returned list and optionally padding length (see above).
-            max_entity_length (:obj:`int`, `optional`):
+            max_entity_length (`int`, *optional*):
                 The maximum length of the entity sequence.
-            pad_to_multiple_of (:obj:`int`, `optional`):
+            pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-            return_attention_mask (:obj:`bool`, `optional`):
+            return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
-                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. `What are
-                attention masks? <../glossary.html#attention-mask>`__
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+                to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are
+                attention masks?](../glossary#attention-mask)
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
-            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            verbose (`bool`, *optional*, defaults to `True`):
                 Whether or not to print more information and warnings.
         """
         # If we have a list of dicts, let's convert it in a dict of lists
diff --git a/src/transformers/models/lxmert/configuration_lxmert.py b/src/transformers/models/lxmert/configuration_lxmert.py
index e4d9474941..a0fabca535 100644
--- a/src/transformers/models/lxmert/configuration_lxmert.py
+++ b/src/transformers/models/lxmert/configuration_lxmert.py
@@ -28,86 +28,86 @@ LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class LxmertConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.LxmertModel` or a
-    :class:`~transformers.TFLxmertModel`. It is used to instantiate a LXMERT model according to the specified
+    This is the configuration class to store the configuration of a [`LxmertModel`] or a
+    [`TFLxmertModel`]. It is used to instantiate a LXMERT model according to the specified
     arguments, defining the model architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the LXMERT model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.LxmertModel` or
-            :class:`~transformers.TFLxmertModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `input_ids` passed when calling [`LxmertModel`] or
+            [`TFLxmertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        r_layers (:obj:`int`, `optional`, defaults to 5):
+        r_layers (`int`, *optional*, defaults to 5):
             Number of hidden layers in the Transformer visual encoder.
-        l_layers (:obj:`int`, `optional`, defaults to 9):
+        l_layers (`int`, *optional*, defaults to 9):
             Number of hidden layers in the Transformer language encoder.
-        x_layers (:obj:`int`, `optional`, defaults to 5):
+        x_layers (`int`, *optional*, defaults to 5):
             Number of hidden layers in the Transformer cross modality encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 5):
+        num_attention_heads (`int`, *optional*, defaults to 5):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into [`BertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        visual_feat_dim (:obj:`int`, `optional`, defaults to 2048):
+        visual_feat_dim (`int`, *optional*, defaults to 2048):
             This represents the last dimension of the pooled-object features used as input for the model, representing
             the size of each object feature itself.
-        visual_pos_dim (:obj:`int`, `optional`, defaults to 4):
+        visual_pos_dim (`int`, *optional*, defaults to 4):
             This represents the number of spacial features that are mixed into the visual features. The default is set
             to 4 because most commonly this will represent the location of a bounding box. i.e., (x, y, width, height)
-        visual_loss_normalizer (:obj:`float`, `optional`, defaults to 1/15):
+        visual_loss_normalizer (`float`, *optional*, defaults to 1/15):
             This represents the scaling factor in which each visual loss is multiplied by if during pretraining, one
             decided to train with multiple vision-based loss objectives.
-        num_qa_labels (:obj:`int`, `optional`, defaults to 9500):
+        num_qa_labels (`int`, *optional*, defaults to 9500):
             This represents the total number of different question answering (QA) labels there are. If using more than
             one dataset with QA, the user will need to account for the total number of labels that all of the datasets
             have in total.
-        num_object_labels (:obj:`int`, `optional`, defaults to 1600):
+        num_object_labels (`int`, *optional*, defaults to 1600):
             This represents the total number of semantically unique objects that lxmert will be able to classify a
             pooled-object feature as belonging too.
-        num_attr_labels (:obj:`int`, `optional`, defaults to 400):
+        num_attr_labels (`int`, *optional*, defaults to 400):
             This represents the total number of semantically unique attributes that lxmert will be able to classify a
             pooled-object feature as possessing.
-        task_matched (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        task_matched (`bool`, *optional*, defaults to `True`):
             This task is used for sentence-image matching. If the sentence correctly describes the image the label will
             be 1. If the sentence does not correctly describe the image, the label will be 0.
-        task_mask_lm (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        task_mask_lm (`bool`, *optional*, defaults to `True`):
             Whether or not to add masked language modeling (as used in pretraining models such as BERT) to the loss
             objective.
-        task_obj_predict (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        task_obj_predict (`bool`, *optional*, defaults to `True`):
             Whether or not to add object prediction, attribute prediction and feature regression to the loss objective.
-        task_qa (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        task_qa (`bool`, *optional*, defaults to `True`):
             Whether or not to add the question-answering loss to the objective
-        visual_obj_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        visual_obj_loss (`bool`, *optional*, defaults to `True`):
             Whether or not to calculate the object-prediction loss objective
-        visual_attr_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        visual_attr_loss (`bool`, *optional*, defaults to `True`):
             Whether or not to calculate the attribute-prediction loss objective
-        visual_feat_loss (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        visual_feat_loss (`bool`, *optional*, defaults to `True`):
             Whether or not to calculate the feature-regression loss objective
-        output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        output_attentions (`bool`, *optional*, defaults to `False`):
             Whether or not the model should return the attentions from the vision, language, and cross-modality layers
             should be returned.
-        output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        output_hidden_states (`bool`, *optional*, defaults to `False`):
             Whether or not the model should return the hidden states from the vision, language, and cross-modality
             layers should be returned.
     """
diff --git a/src/transformers/models/lxmert/tokenization_lxmert.py b/src/transformers/models/lxmert/tokenization_lxmert.py
index 75f55e5607..5d4e97ad54 100644
--- a/src/transformers/models/lxmert/tokenization_lxmert.py
+++ b/src/transformers/models/lxmert/tokenization_lxmert.py
@@ -37,10 +37,10 @@ class LxmertTokenizer(BertTokenizer):
     r"""
     Construct an LXMERT tokenizer.
 
-    :class:`~transformers.LxmertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    [`LxmertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
     tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/lxmert/tokenization_lxmert_fast.py b/src/transformers/models/lxmert/tokenization_lxmert_fast.py
index 9f179fb319..08323d25f6 100644
--- a/src/transformers/models/lxmert/tokenization_lxmert_fast.py
+++ b/src/transformers/models/lxmert/tokenization_lxmert_fast.py
@@ -39,12 +39,12 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class LxmertTokenizerFast(BertTokenizerFast):
     r"""
-    Construct a "fast" LXMERT tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" LXMERT tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.LxmertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+    [`LxmertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/m2m_100/configuration_m2m_100.py b/src/transformers/models/m2m_100/configuration_m2m_100.py
index a4a0df749c..3651f51487 100644
--- a/src/transformers/models/m2m_100/configuration_m2m_100.py
+++ b/src/transformers/models/m2m_100/configuration_m2m_100.py
@@ -28,71 +28,71 @@ M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class M2M100Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.M2M100Model`. It is used to
+    This is the configuration class to store the configuration of a [`M2M100Model`]. It is used to
     instantiate an M2M100 model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the M2M100 `m2m100_418M
-    <https://huggingface.co/facebook/m2m100_418M>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the M2M100 [m2m100_418M](https://huggingface.co/facebook/m2m100_418M) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 50265):
             Vocabulary size of the M2M100 model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.M2M100Model` or
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            `input_ids` passed when calling [`M2M100Model`].
+        d_model (`int`, *optional*, defaults to 1024):
             Dimensionality of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 12):
+        encoder_layers (`int`, *optional*, defaults to 12):
             Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+        decoder_layers (`int`, *optional*, defaults to 12):
             Number of decoder layers.
-        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for classifier.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
 
-        Example::
+    Example:
 
-            >>> from transformers import M2M100Model, M2M100Config
+    ```python
+    >>> from transformers import M2M100Model, M2M100Config
 
-            >>> # Initializing a M2M100 facebook/m2m100_418M style configuration
-            >>> configuration = M2M100Config()
+    >>> # Initializing a M2M100 facebook/m2m100_418M style configuration
+    >>> configuration = M2M100Config()
 
-            >>> # Initializing a model from the facebook/m2m100_418M style configuration
-            >>> model = M2M100Model(configuration)
+    >>> # Initializing a model from the facebook/m2m100_418M style configuration
+    >>> model = M2M100Model(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "m2m_100"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py
index 88ce4bd44d..7cd7bc4003 100644
--- a/src/transformers/models/m2m_100/tokenization_m2m_100.py
+++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py
@@ -63,60 +63,60 @@ FAIRSEQ_LANGUAGE_CODES = {
 
 class M2M100Tokenizer(PreTrainedTokenizer):
     """
-    Construct an M2M100 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Construct an M2M100 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        spm_file (:obj:`str`):
-            Path to `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension)
+        spm_file (`str`):
+            Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension)
             that contains the vocabulary.
-        src_lang (:obj:`str`, `optional`):
+        src_lang (`str`, *optional*):
             A string representing the source language.
-        tgt_lang (:obj:`str`, `optional`):
+        tgt_lang (`str`, *optional*):
             A string representing the target language.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        language_codes (:obj:`str`, `optional`, defaults to :obj:`"m2m100"`):
-            What language codes to use. Should be one of :obj:`"m2m100"` or :obj:`"wmt21"`.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        language_codes (`str`, *optional*, defaults to `"m2m100"`):
+            What language codes to use. Should be one of `"m2m100"` or `"wmt21"`.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import M2M100Tokenizer
-        >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M, src_lang="en", tgt_lang="ro")
-        >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
-        >>> tgt_text =  "Şeful ONU declară că nu există o soluţie militară în Siria"
-        >>> model_inputs = tokenizer(src_text, return_tensors="pt")
-        >>> with tokenizer.as_target_tokenizer():
-        ...    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
-        >>> # model(**model_inputs, labels=labels) should work
-    """
+    ```python
+    >>> from transformers import M2M100Tokenizer
+    >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro")
+    >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
+    >>> tgt_text =  "Şeful ONU declară că nu există o soluţie militară în Siria"
+    >>> model_inputs = tokenizer(src_text, return_tensors="pt")
+    >>> with tokenizer.as_target_tokenizer():
+    ...    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
+    >>> # model(**model_inputs, labels=labels) should work
+    ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -226,18 +226,18 @@ class M2M100Tokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -256,22 +256,22 @@ class M2M100Tokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An MBART sequence has the following format, where ``X`` represents the sequence:
+        adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:
 
-        - ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
-        - ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]``
+        - `input_ids` (for encoder) `X [eos, src_lang_code]`
+        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
 
         BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
         separator.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return self.prefix_tokens + token_ids_0 + self.suffix_tokens
diff --git a/src/transformers/models/marian/configuration_marian.py b/src/transformers/models/marian/configuration_marian.py
index 825c7d707a..a0be3c7723 100644
--- a/src/transformers/models/marian/configuration_marian.py
+++ b/src/transformers/models/marian/configuration_marian.py
@@ -28,77 +28,78 @@ MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class MarianConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.MarianModel`. It is used to
+    This is the configuration class to store the configuration of a [`MarianModel`]. It is used to
     instantiate an Marian model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the Marian
-    `Helsinki-NLP/opus-mt-en-de <https://huggingface.co/Helsinki-NLP/opus-mt-en-de>`__ architecture.
+    [Helsinki-NLP/opus-mt-en-de](https://huggingface.co/Helsinki-NLP/opus-mt-en-de) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 50265):
             Vocabulary size of the Marian model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.MarianModel` or
-            :class:`~transformers.TFMarianModel`.
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            `input_ids` passed when calling [`MarianModel`] or
+            [`TFMarianModel`].
+        d_model (`int`, *optional*, defaults to 1024):
             Dimensionality of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 12):
+        encoder_layers (`int`, *optional*, defaults to 12):
             Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+        decoder_layers (`int`, *optional*, defaults to 12):
             Number of decoder layers.
-        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for classifier.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
             Scale embeddings by diving by sqrt(d_model).
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models)
-        forced_eos_token_id (:obj:`int`, `optional`, defaults to 0):
-            The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
-            :obj:`eos_token_id`.
+        forced_eos_token_id (`int`, *optional*, defaults to 0):
+            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+            `eos_token_id`.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import MarianModel, MarianConfig
+    ```python
+    >>> from transformers import MarianModel, MarianConfig
 
-        >>> # Initializing a Marian Helsinki-NLP/opus-mt-en-de style configuration
-        >>> configuration = MarianConfig()
+    >>> # Initializing a Marian Helsinki-NLP/opus-mt-en-de style configuration
+    >>> configuration = MarianConfig()
 
-        >>> # Initializing a model from the Helsinki-NLP/opus-mt-en-de style configuration
-        >>> model = MarianModel(configuration)
+    >>> # Initializing a model from the Helsinki-NLP/opus-mt-en-de style configuration
+    >>> model = MarianModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "marian"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/marian/modeling_flax_marian.py b/src/transformers/models/marian/modeling_flax_marian.py
index 5d58f03877..388d0cafd7 100644
--- a/src/transformers/models/marian/modeling_flax_marian.py
+++ b/src/transformers/models/marian/modeling_flax_marian.py
@@ -975,17 +975,18 @@ class FlaxMarianPreTrainedModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import MarianTokenizer, FlaxMarianMTModel
+        ```python
+        >>> from transformers import MarianTokenizer, FlaxMarianMTModel
 
-            >>> tokenizer = MarianTokenizer.from_pretrained('facebook/marian-large-cnn')
-            >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
-            >>> encoder_outputs = model.encode(**inputs)
-        """
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
+        >>> encoder_outputs = model.encode(**inputs)
+        ```"""
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1041,23 +1042,24 @@ class FlaxMarianPreTrainedModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import MarianTokenizer, FlaxMarianMTModel
+        ```python
+        >>> from transformers import MarianTokenizer, FlaxMarianMTModel
 
-            >>> tokenizer = MarianTokenizer.from_pretrained('facebook/marian-large-cnn')
-            >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
-            >>> encoder_outputs = model.encode(**inputs)
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
+        >>> encoder_outputs = model.encode(**inputs)
 
-            >>> decoder_start_token_id = model.config.decoder_start_token_id
-            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> last_decoder_hidden_states = outputs.last_hidden_state
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> last_decoder_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1308,23 +1310,24 @@ class FlaxMarianMTModel(FlaxMarianPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import MarianTokenizer, FlaxMarianMTModel
+        ```python
+        >>> from transformers import MarianTokenizer, FlaxMarianMTModel
 
-            >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
-            >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
-            >>> encoder_outputs = model.encode(**inputs)
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=64, return_tensors='jax')
+        >>> encoder_outputs = model.encode(**inputs)
 
-            >>> decoder_start_token_id = model.config.decoder_start_token_id
-            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> logits = outputs.logits
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1468,20 +1471,22 @@ class FlaxMarianMTModel(FlaxMarianPreTrainedModel):
 FLAX_MARIAN_MT_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> from transformers import MarianTokenizer, FlaxMarianMTModel
+    ```python
+    >>> from transformers import MarianTokenizer, FlaxMarianMTModel
 
-        >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
-        >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+    >>> model = FlaxMarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+    >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
 
-        >>> text = "My friends are cool but they eat too many carbs."
-        >>> input_ids = tokenizer(text, max_length=64, return_tensors='jax').input_ids
+    >>> text = "My friends are cool but they eat too many carbs."
+    >>> input_ids = tokenizer(text, max_length=64, return_tensors='jax').input_ids
 
-        >>> sequences = model.generate(input_ids, max_length=64, num_beams=2).sequences
+    >>> sequences = model.generate(input_ids, max_length=64, num_beams=2).sequences
 
-        >>> outputs = tokenizer.batch_decode(sequences, skip_special_tokens=True)
-        >>> # should give `Meine Freunde sind cool, aber sie essen zu viele Kohlenhydrate.`
+    >>> outputs = tokenizer.batch_decode(sequences, skip_special_tokens=True)
+    >>> # should give *Meine Freunde sind cool, aber sie essen zu viele Kohlenhydrate.*
+    ```
 """
 
 overwrite_call_docstring(
diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py
index 19903612ce..b7752e938d 100755
--- a/src/transformers/models/marian/modeling_marian.py
+++ b/src/transformers/models/marian/modeling_marian.py
@@ -523,23 +523,25 @@ MARIAN_START_DOCSTRING = r"""
 
 MARIAN_GENERATION_EXAMPLE = r"""
         Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints.
-        Available models are listed `here <https://huggingface.co/models?search=Helsinki-NLP>`__.
+        Available models are listed [here](https://huggingface.co/models?search=Helsinki-NLP).
 
-        Examples::
+        Examples:
 
-            >>> from transformers import MarianTokenizer, MarianMTModel
-            >>> from typing import List
-            >>> src = 'fr'  # source language
-            >>> trg = 'en'  # target language
-            >>> sample_text = "où est l'arrêt de bus ?"
-            >>> model_name = f'Helsinki-NLP/opus-mt-{src}-{trg}'
+        ```python
+        >>> from transformers import MarianTokenizer, MarianMTModel
+        >>> from typing import List
+        >>> src = 'fr'  # source language
+        >>> trg = 'en'  # target language
+        >>> sample_text = "où est l'arrêt de bus ?"
+        >>> model_name = f'Helsinki-NLP/opus-mt-{src}-{trg}'
 
-            >>> model = MarianMTModel.from_pretrained(model_name)
-            >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
-            >>> batch = tokenizer([sample_text], return_tensors="pt")
-            >>> gen = model.generate(**batch)
-            >>> tokenizer.batch_decode(gen, skip_special_tokens=True)
-            "Where is the bus stop ?"
+        >>> model = MarianMTModel.from_pretrained(model_name)
+        >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
+        >>> batch = tokenizer([sample_text], return_tensors="pt")
+        >>> gen = model.generate(**batch)
+        >>> tokenizer.batch_decode(gen, skip_special_tokens=True)
+        "Where is the bus stop ?"
+        ```
 """
 
 MARIAN_INPUTS_DOCSTRING = r"""
@@ -1124,20 +1126,21 @@ class MarianModel(MarianPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import MarianTokenizer, MarianModel
+        ```python
+        >>> from transformers import MarianTokenizer, MarianModel
 
-            >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
-            >>> model = MarianModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+        >>> model = MarianModel.from_pretrained('Helsinki-NLP/opus-mt-en-de')
 
-            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
-            >>> decoder_input_ids = tokenizer("<pad> Studien haben gezeigt dass es hilfreich ist einen Hund zu besitzen",
-            ... return_tensors="pt", add_special_tokens=False).input_ids  # Batch size 1
-            >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> decoder_input_ids = tokenizer("<pad> Studien haben gezeigt dass es hilfreich ist einen Hund zu besitzen",
+        ... return_tensors="pt", add_special_tokens=False).input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
 
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/marian/modeling_tf_marian.py b/src/transformers/models/marian/modeling_tf_marian.py
index d4bb501838..f8638c14a3 100644
--- a/src/transformers/models/marian/modeling_tf_marian.py
+++ b/src/transformers/models/marian/modeling_tf_marian.py
@@ -555,23 +555,25 @@ MARIAN_START_DOCSTRING = r"""
 
 MARIAN_GENERATION_EXAMPLE = r"""
         TF version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints. Available
-        models are listed `here <https://huggingface.co/models?search=Helsinki-NLP>`__.
+        models are listed [here](https://huggingface.co/models?search=Helsinki-NLP).
 
-        Examples::
+        Examples:
 
-            >>> from transformers import MarianTokenizer, TFMarianMTModel
-            >>> from typing import List
-            >>> src = 'fr'  # source language
-            >>> trg = 'en'  # target language
-            >>> sample_text = "où est l'arrêt de bus ?"
-            >>> model_name = f'Helsinki-NLP/opus-mt-{src}-{trg}'
+        ```python
+        >>> from transformers import MarianTokenizer, TFMarianMTModel
+        >>> from typing import List
+        >>> src = 'fr'  # source language
+        >>> trg = 'en'  # target language
+        >>> sample_text = "où est l'arrêt de bus ?"
+        >>> model_name = f'Helsinki-NLP/opus-mt-{src}-{trg}'
 
-            >>> model = TFMarianMTModel.from_pretrained(model_name)
-            >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
-            >>> batch = tokenizer([sample_text], return_tensors="tf")
-            >>> gen = model.generate(**batch)
-            >>> tokenizer.batch_decode(gen, skip_special_tokens=True)
-            "Where is the bus stop ?"
+        >>> model = TFMarianMTModel.from_pretrained(model_name)
+        >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
+        >>> batch = tokenizer([sample_text], return_tensors="tf")
+        >>> gen = model.generate(**batch)
+        >>> tokenizer.batch_decode(gen, skip_special_tokens=True)
+        "Where is the bus stop ?"
+        ```
 """
 
 MARIAN_INPUTS_DOCSTRING = r"""
diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py
index 828afd53b9..5022569b8d 100644
--- a/src/transformers/models/marian/tokenization_marian.py
+++ b/src/transformers/models/marian/tokenization_marian.py
@@ -55,61 +55,61 @@ PRETRAINED_INIT_CONFIGURATION = {}
 
 class MarianTokenizer(PreTrainedTokenizer):
     r"""
-    Construct a Marian tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Construct a Marian tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        source_spm (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
+        source_spm (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
             contains the vocabulary for the source language.
-        target_spm (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
+        target_spm (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
             contains the vocabulary for the target language.
-        source_lang (:obj:`str`, `optional`):
+        source_lang (`str`, *optional*):
             A string representing the source language.
-        target_lang (:obj:`str`, `optional`):
+        target_lang (`str`, *optional*):
             A string representing the target language.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        model_max_length (:obj:`int`, `optional`, defaults to 512):
+        model_max_length (`int`, *optional*, defaults to 512):
             The maximum sentence length the model accepts.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
             Additional special tokens used by the tokenizer.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import MarianTokenizer
-        >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
-        >>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
-        >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
-        >>> inputs = tokenizer(src_texts, return_tensors="pt", padding=True)
-        >>> with tokenizer.as_target_tokenizer():
-        ...     labels = tokenizer(tgt_texts, return_tensors="pt", padding=True)
-        >>> inputs["labels"] = labels["input_ids"]
-        # keys  [input_ids, attention_mask, labels].
-        >>> outputs = model(**inputs) should work
-    """
+    ```python
+    >>> from transformers import MarianTokenizer
+    >>> tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
+    >>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
+    >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
+    >>> inputs = tokenizer(src_texts, return_tensors="pt", padding=True)
+    >>> with tokenizer.as_target_tokenizer():
+    ...     labels = tokenizer(tgt_texts, return_tensors="pt", padding=True)
+    >>> inputs["labels"] = labels["input_ids"]
+    >>> # keys  [input_ids, attention_mask, labels].
+    >>> outputs = model(**inputs)  # should work
+    ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
@@ -202,20 +202,20 @@ class MarianTokenizer(PreTrainedTokenizer):
         Convert a list of lists of token ids into a list of strings by calling decode.
 
         Args:
-            sequences (:obj:`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
-                List of tokenized input ids. Can be obtained using the ``__call__`` method.
-            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
+                List of tokenized input ids. Can be obtained using the `__call__` method.
+            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
                 Whether or not to clean up the tokenization spaces.
-            use_source_tokenizer (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                 Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                 problems).
-            kwargs (additional keyword arguments, `optional`):
+            kwargs (additional keyword arguments, *optional*):
                 Will be passed to the underlying model specific decode method.
 
         Returns:
-            :obj:`List[str]`: The list of decoded sentences.
+            `List[str]`: The list of decoded sentences.
         """
         return super().batch_decode(sequences, **kwargs)
 
@@ -224,23 +224,23 @@ class MarianTokenizer(PreTrainedTokenizer):
         Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
         tokens and clean up tokenization spaces.
 
-        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
+        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
 
         Args:
-            token_ids (:obj:`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
-                List of tokenized input ids. Can be obtained using the ``__call__`` method.
-            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
+                List of tokenized input ids. Can be obtained using the `__call__` method.
+            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
                 Whether or not to clean up the tokenization spaces.
-            use_source_tokenizer (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                 Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                 problems).
-            kwargs (additional keyword arguments, `optional`):
+            kwargs (additional keyword arguments, *optional*):
                 Will be passed to the underlying model specific decode method.
 
         Returns:
-            :obj:`str`: The decoded sentence.
+            `str`: The decoded sentence.
         """
         return super().decode(token_ids, **kwargs)
 
diff --git a/src/transformers/models/mbart/configuration_mbart.py b/src/transformers/models/mbart/configuration_mbart.py
index d1eb27c0e8..2e4769583f 100644
--- a/src/transformers/models/mbart/configuration_mbart.py
+++ b/src/transformers/models/mbart/configuration_mbart.py
@@ -32,77 +32,77 @@ MBART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class MBartConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.MBartModel`. It is used to
+    This is the configuration class to store the configuration of a [`MBartModel`]. It is used to
     instantiate an MBART model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the MBART `facebook/mbart-large-cc25
-    <https://huggingface.co/facebook/mbart-large-cc25>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the MBART [facebook/mbart-large-cc25](https://huggingface.co/facebook/mbart-large-cc25) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 50265):
             Vocabulary size of the MBART model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.MBartModel` or
-            :class:`~transformers.TFMBartModel`.
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            `inputs_ids` passed when calling [`MBartModel`] or
+            [`TFMBartModel`].
+        d_model (`int`, *optional*, defaults to 1024):
             Dimensionality of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 12):
+        encoder_layers (`int`, *optional*, defaults to 12):
             Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+        decoder_layers (`int`, *optional*, defaults to 12):
             Number of decoder layers.
-        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for classifier.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the
+            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the
+            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
             Scale embeddings by diving by sqrt(d_model).
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models)
-        forced_eos_token_id (:obj:`int`, `optional`, defaults to 2):
-            The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
-            :obj:`eos_token_id`.
+        forced_eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+            `eos_token_id`.
 
-    Example::
+    Example:
 
-        >>> from transformers import MBartModel, MBartConfig
+    ```python
+    >>> from transformers import MBartModel, MBartConfig
 
-        >>> # Initializing a MBART facebook/mbart-large-cc25 style configuration
-        >>> configuration = MBartConfig()
+    >>> # Initializing a MBART facebook/mbart-large-cc25 style configuration
+    >>> configuration = MBartConfig()
 
-        >>> # Initializing a model from the facebook/mbart-large-cc25 style configuration
-        >>> model = MBartModel(configuration)
+    >>> # Initializing a model from the facebook/mbart-large-cc25 style configuration
+    >>> model = MBartModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "mbart"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/mbart/modeling_flax_mbart.py b/src/transformers/models/mbart/modeling_flax_mbart.py
index 88d8b76b69..d6a425ff3c 100644
--- a/src/transformers/models/mbart/modeling_flax_mbart.py
+++ b/src/transformers/models/mbart/modeling_flax_mbart.py
@@ -1041,17 +1041,18 @@ class FlaxMBartPreTrainedModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
+        ```python
+        >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
 
-            >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
-            >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
+        >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
+        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
-            >>> encoder_outputs = model.encode(**inputs)
-        """
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> encoder_outputs = model.encode(**inputs)
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1106,23 +1107,24 @@ class FlaxMBartPreTrainedModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
+        ```python
+        >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
 
-            >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
-            >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
+        >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
+        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
-            >>> encoder_outputs = model.encode(**inputs)
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> encoder_outputs = model.encode(**inputs)
 
-            >>> decoder_start_token_id = model.config.decoder_start_token_id
-            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> last_decoder_hidden_states = outputs.last_hidden_state
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> last_decoder_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1372,23 +1374,24 @@ class FlaxMBartForConditionalGeneration(FlaxMBartPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
+        ```python
+        >>> from transformers import MBartTokenizer, FlaxMBartForConditionalGeneration
 
-            >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
-            >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
+        >>> model = FlaxMBartForConditionalGeneration.from_pretrained('facebook/mbart-large-cc25')
+        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-cc25')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
-            >>> encoder_outputs = model.encode(**inputs)
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='jax')
+        >>> encoder_outputs = model.encode(**inputs)
 
-            >>> decoder_start_token_id = model.config.decoder_start_token_id
-            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> logits = outputs.logits
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py
index cf3bfab08f..ce33736f74 100644
--- a/src/transformers/models/mbart/tokenization_mbart.py
+++ b/src/transformers/models/mbart/tokenization_mbart.py
@@ -71,24 +71,25 @@ class MBartTokenizer(XLMRobertaTokenizer):
     """
     Construct an MBART tokenizer.
 
-    :class:`~transformers.MBartTokenizer` is a subclass of :class:`~transformers.XLMRobertaTokenizer`. Refer to
-    superclass :class:`~transformers.XLMRobertaTokenizer` for usage examples and documentation concerning the
+    [`MBartTokenizer`] is a subclass of [`XLMRobertaTokenizer`]. Refer to
+    superclass [`XLMRobertaTokenizer`] for usage examples and documentation concerning the
     initialization parameters and other methods.
 
-    The tokenization method is ``<tokens> <eos> <language code>`` for source language documents, and ``<language code>
+    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
-    <tokens> <eos>``` for target language documents.
+    <tokens> <eos>` for target language documents.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import MBartTokenizer
-        >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-en-ro', src_lang="en_XX", tgt_lang="ro_RO")
-        >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
-        >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-        >>> inputs = tokenizer(example_english_phrase, return_tensors="pt)
-        >>> with tokenizer.as_target_tokenizer():
-        ...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")
-        >>> inputs["labels"] = labels["input_ids"]
-    """
+    ```python
+    >>> from transformers import MBartTokenizer
+    >>> tokenizer = MBartTokenizer.from_pretrained('facebook/mbart-large-en-ro', src_lang="en_XX", tgt_lang="ro_RO")
+    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
+    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
+    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
+    >>> with tokenizer.as_target_tokenizer():
+    ...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")
+    >>> inputs["labels"] = labels["input_ids"]
+    ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -149,18 +150,18 @@ class MBartTokenizer(XLMRobertaTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -179,22 +180,22 @@ class MBartTokenizer(XLMRobertaTokenizer):
     ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An MBART sequence has the following format, where ``X`` represents the sequence:
+        adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:
 
-        - ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
-        - ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]``
+        - `input_ids` (for encoder) `X [eos, src_lang_code]`
+        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
 
         BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
         separator.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return self.prefix_tokens + token_ids_0 + self.suffix_tokens
diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py
index b135ecba4c..bf578bb2c4 100644
--- a/src/transformers/models/mbart/tokenization_mbart_fast.py
+++ b/src/transformers/models/mbart/tokenization_mbart_fast.py
@@ -82,27 +82,27 @@ FAIRSEQ_LANGUAGE_CODES = [
 
 class MBartTokenizerFast(XLMRobertaTokenizerFast):
     """
-    Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library). Based on `BPE
-    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
+    Construct a "fast" MBART tokenizer (backed by HuggingFace's *tokenizers* library). Based on [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
 
-    :class:`~transformers.MBartTokenizerFast` is a subclass of :class:`~transformers.XLMRobertaTokenizerFast`. Refer to
-    superclass :class:`~transformers.XLMRobertaTokenizerFast` for usage examples and documentation concerning the
+    [`MBartTokenizerFast`] is a subclass of [`XLMRobertaTokenizerFast`]. Refer to
+    superclass [`XLMRobertaTokenizerFast`] for usage examples and documentation concerning the
     initialization parameters and other methods.
 
-    The tokenization method is ``<tokens> <eos> <language code>`` for source language documents, and ``<language code>
-    <tokens> <eos>``` for target language documents.
+    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
+    <tokens> <eos>` for target language documents.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import MBartTokenizerFast
-        >>> tokenizer = MBartTokenizerFast.from_pretrained('facebook/mbart-large-en-ro', src_lang="en_XX", tgt_lang="ro_RO")
-        >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
-        >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-        >>> inputs = tokenizer(example_english_phrase, return_tensors="pt)
-        >>> with tokenizer.as_target_tokenizer():
-        ...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")
-        >>> inputs["labels"] = labels["input_ids"]
-    """
+    ```python
+    >>> from transformers import MBartTokenizerFast
+    >>> tokenizer = MBartTokenizerFast.from_pretrained('facebook/mbart-large-en-ro', src_lang="en_XX", tgt_lang="ro_RO")
+    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
+    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
+    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
+    >>> with tokenizer.as_target_tokenizer():
+    ...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")
+    >>> inputs["labels"] = labels["input_ids"]
+    ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -164,22 +164,22 @@ class MBartTokenizerFast(XLMRobertaTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. The special tokens depend on calling set_lang.
 
-        An MBART sequence has the following format, where ``X`` represents the sequence:
+        An MBART sequence has the following format, where `X` represents the sequence:
 
-        - ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
-        - ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]``
+        - `input_ids` (for encoder) `X [eos, src_lang_code]`
+        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
 
         BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
         separator.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return self.prefix_tokens + token_ids_0 + self.suffix_tokens
diff --git a/src/transformers/models/mbart50/tokenization_mbart50.py b/src/transformers/models/mbart50/tokenization_mbart50.py
index dbfd53a7fb..48f34cd9ac 100644
--- a/src/transformers/models/mbart50/tokenization_mbart50.py
+++ b/src/transformers/models/mbart50/tokenization_mbart50.py
@@ -47,61 +47,61 @@ FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE",
 
 class MBart50Tokenizer(PreTrainedTokenizer):
     """
-    Construct a MBart50 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Construct a MBart50 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        src_lang (:obj:`str`, `optional`):
+        src_lang (`str`, *optional*):
             A string representing the source language.
-        tgt_lang (:obj:`str`, `optional`):
+        tgt_lang (`str`, *optional*):
             A string representing the target language.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import MBart50Tokenizer
-        >>> tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
-        >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
-        >>> tgt_text =  "Şeful ONU declară că nu există o soluţie militară în Siria"
-        >>> model_inputs = tokenizer(src_text, return_tensors="pt")
-        >>> with tokenizer.as_target_tokenizer():
-        ...    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
-        >>> # model(**model_inputs, labels=labels) should work
-    """
+    ```python
+    >>> from transformers import MBart50Tokenizer
+    >>> tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
+    >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
+    >>> tgt_text =  "Şeful ONU declară că nu există o soluţie militară în Siria"
+    >>> model_inputs = tokenizer(src_text, return_tensors="pt")
+    >>> with tokenizer.as_target_tokenizer():
+    ...    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
+    >>> # model(**model_inputs, labels=labels) should work
+    ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -252,18 +252,18 @@ class MBart50Tokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -282,22 +282,22 @@ class MBart50Tokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
-        adding special tokens. An MBART-50 sequence has the following format, where ``X`` represents the sequence:
+        adding special tokens. An MBART-50 sequence has the following format, where `X` represents the sequence:
 
-        - ``input_ids`` (for encoder) ``[src_lang_code] X [eos]``
-        - ``labels``: (for decoder) ``[tgt_lang_code] X [eos]``
+        - `input_ids` (for encoder) `[src_lang_code] X [eos]`
+        - `labels`: (for decoder) `[tgt_lang_code] X [eos]`
 
         BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
         separator.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return self.prefix_tokens + token_ids_0 + self.suffix_tokens
diff --git a/src/transformers/models/mbart50/tokenization_mbart50_fast.py b/src/transformers/models/mbart50/tokenization_mbart50_fast.py
index 93f93d2423..7b481f4362 100644
--- a/src/transformers/models/mbart50/tokenization_mbart50_fast.py
+++ b/src/transformers/models/mbart50/tokenization_mbart50_fast.py
@@ -56,48 +56,48 @@ FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE",
 
 class MBart50TokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" MBART tokenizer for mBART-50 (backed by HuggingFace's `tokenizers` library). Based on `BPE
-    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
+    Construct a "fast" MBART tokenizer for mBART-50 (backed by HuggingFace's *tokenizers* library). Based on [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        src_lang (:obj:`str`, `optional`):
+        src_lang (`str`, *optional*):
             A string representing the source language.
-        tgt_lang (:obj:`str`, `optional`):
+        tgt_lang (`str`, *optional*):
             A string representing the target language.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import MBart50TokenizerFast
-        >>> tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
-        >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
-        >>> tgt_text =  "Şeful ONU declară că nu există o soluţie militară în Siria"
-        >>> model_inputs = tokenizer(src_text, return_tensors="pt")
-        >>> with tokenizer.as_target_tokenizer():
-        ...    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
-        >>> # model(**model_inputs, labels=labels) should work
-    """
+    ```python
+    >>> from transformers import MBart50TokenizerFast
+    >>> tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
+    >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
+    >>> tgt_text =  "Şeful ONU declară că nu există o soluţie militară în Siria"
+    >>> model_inputs = tokenizer(src_text, return_tensors="pt")
+    >>> with tokenizer.as_target_tokenizer():
+    ...    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
+    >>> # model(**model_inputs, labels=labels) should work
+    ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -172,22 +172,22 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. The special tokens depend on calling set_lang.
 
-        An MBART-50 sequence has the following format, where ``X`` represents the sequence:
+        An MBART-50 sequence has the following format, where `X` represents the sequence:
 
-        - ``input_ids`` (for encoder) ``[src_lang_code] X [eos]``
-        - ``labels``: (for decoder) ``[tgt_lang_code] X [eos]``
+        - `input_ids` (for encoder) `[src_lang_code] X [eos]`
+        - `labels`: (for decoder) `[tgt_lang_code] X [eos]`
 
         BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
         separator.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return self.prefix_tokens + token_ids_0 + self.suffix_tokens
diff --git a/src/transformers/models/megatron_bert/configuration_megatron_bert.py b/src/transformers/models/megatron_bert/configuration_megatron_bert.py
index d6e32cd496..0210f0466f 100644
--- a/src/transformers/models/megatron_bert/configuration_megatron_bert.py
+++ b/src/transformers/models/megatron_bert/configuration_megatron_bert.py
@@ -27,68 +27,67 @@ MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class MegatronBertConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.MegatronBertModel`. It is
+    This is the configuration class to store the configuration of a [`MegatronBertModel`]. It is
     used to instantiate a MEGATRON_BERT model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the MEGATRON_BERT
-    `megatron-bert-uncased-345m <https://huggingface.co/nvidia/megatron-bert-uncased-345m>`__ architecture.
+    [megatron-bert-uncased-345m](https://huggingface.co/nvidia/megatron-bert-uncased-345m) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 29056):
+        vocab_size (`int`, *optional*, defaults to 29056):
             Vocabulary size of the MEGATRON_BERT model. Defines the number of different tokens that can be represented
-            by the :obj:`inputs_ids` passed when calling :class:`~transformers.MegatronBertModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 1024):
+            by the `input_ids` passed when calling [`MegatronBertModel`].
+        hidden_size (`int`, *optional*, defaults to 1024):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 24):
+        num_hidden_layers (`int`, *optional*, defaults to 24):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        num_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 4096):
+        intermediate_size (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling
-            :class:`~transformers.MegatronBertModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling
+            [`MegatronBertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
-            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
-            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
-            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
-            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
-            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
-            <https://arxiv.org/abs/2009.13658>`__.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`,
+            `"relative_key_query"`. For positional embeddings use `"absolute"`. For more information on
+            `"relative_key"`, please refer to [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). For more information on `"relative_key_query"`, please refer to
+            *Method 4* in [Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if ``config.is_decoder=True``.
+            relevant if `config.is_decoder=True`.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import MegatronBertModel, MegatronBertConfig
+    ```python
+    >>> from transformers import MegatronBertModel, MegatronBertConfig
 
-        >>> # Initializing a MEGATRON_BERT bert-base-uncased style configuration
-        >>> configuration = MegatronBertConfig()
+    >>> # Initializing a MEGATRON_BERT megatron-bert-uncased-345m style configuration
+    >>> configuration = MegatronBertConfig()
 
+    >>> # Initializing a model from the megatron-bert-uncased-345m style configuration
-        >>> model = MegatronBertModel(configuration)
+    >>> # Initializing a model from the bert-base-uncased style configuration
+    >>> model = MegatronBertModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "megatron-bert"
 
     def __init__(
diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py
index aa547737c7..06dc3f4430 100644
--- a/src/transformers/models/mluke/tokenization_mluke.py
+++ b/src/transformers/models/mluke/tokenization_mluke.py
@@ -72,159 +72,164 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 }
 
 ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
-            return_token_type_ids (:obj:`bool`, `optional`):
+            return_token_type_ids (`bool`, *optional*):
                 Whether to return token type IDs. If left to the default, will return the token type IDs according to
-                the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+                the specific tokenizer's default, defined by the `return_outputs` attribute.
 
-                `What are token type IDs? <../glossary.html#token-type-ids>`__
-            return_attention_mask (:obj:`bool`, `optional`):
+                [What are token type IDs?](../glossary#token-type-ids)
+            return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
-                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+                to the specific tokenizer's default, defined by the `return_outputs` attribute.
 
-                `What are attention masks? <../glossary.html#attention-mask>`__
-            return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                [What are attention masks?](../glossary#attention-mask)
+            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
-                of pairs) is provided with :obj:`truncation_strategy = longest_first` or :obj:`True`, an error is
+                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is
                 raised instead of returning overflowing tokens.
-            return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
                 Whether or not to return special tokens mask information.
-            return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not to return :obj:`(char_start, char_end)` for each token.
+            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
+                Whether or not to return `(char_start, char_end)` for each token.
 
                 This is only available on fast tokenizers inheriting from
-                :class:`~transformers.PreTrainedTokenizerFast`, if using Python's tokenizer, this method will raise
-                :obj:`NotImplementedError`.
-            return_length  (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                [`PreTrainedTokenizerFast`], if using Python's tokenizer, this method will raise
+                `NotImplementedError`.
+            return_length (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the lengths of the encoded inputs.
-            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            verbose (`bool`, *optional*, defaults to `True`):
                 Whether or not to print more information and warnings.
-            **kwargs: passed to the :obj:`self.tokenize()` method
+            **kwargs: passed to the `self.tokenize()` method
 
-        Return:
-            :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
+        Return:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
 
             - **input_ids** -- List of token ids to be fed to a model.
 
-              `What are input IDs? <../glossary.html#input-ids>`__
+              [What are input IDs?](../glossary#input-ids)
 
-            - **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True`
-              or if `"token_type_ids"` is in :obj:`self.model_input_names`).
+            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True`
+              or if `"token_type_ids"` is in `self.model_input_names`).
 
-              `What are token type IDs? <../glossary.html#token-type-ids>`__
+              [What are token type IDs?](../glossary#token-type-ids)
 
             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
-              :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
+              `return_attention_mask=True` or if `"attention_mask"` is in `self.model_input_names`).
 
-              `What are attention masks? <../glossary.html#attention-mask>`__
+              [What are attention masks?](../glossary#attention-mask)
 
             - **entity_ids** -- List of entity ids to be fed to a model.
 
-              `What are input IDs? <../glossary.html#input-ids>`__
+              [What are input IDs?](../glossary#input-ids)
 
             - **entity_position_ids** -- List of entity positions in the input sequence to be fed to a model.
 
             - **entity_token_type_ids** -- List of entity token type ids to be fed to a model (when
-              :obj:`return_token_type_ids=True` or if `"entity_token_type_ids"` is in :obj:`self.model_input_names`).
+              `return_token_type_ids=True` or if `"entity_token_type_ids"` is in `self.model_input_names`).
 
-              `What are token type IDs? <../glossary.html#token-type-ids>`__
+              [What are token type IDs?](../glossary#token-type-ids)
 
             - **entity_attention_mask** -- List of indices specifying which entities should be attended to by the model
-              (when :obj:`return_attention_mask=True` or if `"entity_attention_mask"` is in
-              :obj:`self.model_input_names`).
+              (when `return_attention_mask=True` or if `"entity_attention_mask"` is in
+              `self.model_input_names`).
 
-              `What are attention masks? <../glossary.html#attention-mask>`__
+              [What are attention masks?](../glossary#attention-mask)
 
             - **entity_start_positions** -- List of the start positions of entities in the word token sequence (when
-              :obj:`task="entity_span_classification"`).
+              `task="entity_span_classification"`).
             - **entity_end_positions** -- List of the end positions of entities in the word token sequence (when
-              :obj:`task="entity_span_classification"`).
-            - **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and
-              :obj:`return_overflowing_tokens=True`).
-            - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
-              :obj:`return_overflowing_tokens=True`).
+              `task="entity_span_classification"`).
+            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
             - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
-              regular sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`).
-            - **length** -- The length of the inputs (when :obj:`return_length=True`)
+              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
+            - **length** -- The length of the inputs (when `return_length=True`)
 
 """
 
 
 class MLukeTokenizer(PreTrainedTokenizer):
     """
-    Adapted from :class:`~transformers.XLMRobertaTokenizer` and :class:`~transformers.LukeTokenizer`. Based on
-    `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Adapted from [`XLMRobertaTokenizer`] and [`LukeTokenizer`]. Based on
+    [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        entity_vocab_file (:obj:`str`):
+        entity_vocab_file (`str`):
             Path to the entity vocabulary file.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        task (:obj:`str`, `optional`):
-            Task for which you want to prepare sequences. One of :obj:`"entity_classification"`,
-            :obj:`"entity_pair_classification"`, or :obj:`"entity_span_classification"`. If you specify this argument,
+        task (`str`, *optional*):
+            Task for which you want to prepare sequences. One of `"entity_classification"`,
+            `"entity_pair_classification"`, or `"entity_span_classification"`. If you specify this argument,
             the entity sequence is automatically created based on the given entity span(s).
-        max_entity_length (:obj:`int`, `optional`, defaults to 32):
-            The maximum length of :obj:`entity_ids`.
-        max_mention_length (:obj:`int`, `optional`, defaults to 30):
+        max_entity_length (`int`, *optional*, defaults to 32):
+            The maximum length of `entity_ids`.
+        max_mention_length (`int`, *optional*, defaults to 30):
             The maximum number of tokens inside an entity span.
-        entity_token_1 (:obj:`str`, `optional`, defaults to :obj:`<ent>`):
+        entity_token_1 (`str`, *optional*, defaults to `"<ent>"`):
             The special token used to represent an entity span in a word token sequence. This token is only used when
-            ``task`` is set to :obj:`"entity_classification"` or :obj:`"entity_pair_classification"`.
-        entity_token_2 (:obj:`str`, `optional`, defaults to :obj:`<ent2>`):
+            `task` is set to `"entity_classification"` or `"entity_pair_classification"`.
+        entity_token_2 (`str`, *optional*, defaults to `"<ent2>"`):
             The special token used to represent an entity span in a word token sequence. This token is only used when
-            ``task`` is set to :obj:`"entity_pair_classification"`.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+            `task` is set to `"entity_pair_classification"`.
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
     Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -373,39 +378,39 @@ class MLukeTokenizer(PreTrainedTokenizer):
         sequences, depending on the task you want to prepare them for.
 
         Args:
-            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text (`str`, `List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence must be a string. Note that this
                 tokenizer does not support tokenization based on pretokenized strings.
-            text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text_pair (`str`, `List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence must be a string. Note that this
                 tokenizer does not support tokenization based on pretokenized strings.
-            entity_spans (:obj:`List[Tuple[int, int]]`, :obj:`List[List[Tuple[int, int]]]`, `optional`):
+            entity_spans (`List[Tuple[int, int]]`, `List[List[Tuple[int, int]]]`, *optional*):
                 The sequence or batch of sequences of entity spans to be encoded. Each sequence consists of tuples each
                 with two integers denoting character-based start and end positions of entities. If you specify
-                :obj:`"entity_classification"` or :obj:`"entity_pair_classification"` as the ``task`` argument in the
-                constructor, the length of each sequence must be 1 or 2, respectively. If you specify ``entities``, the
-                length of each sequence must be equal to the length of each sequence of ``entities``.
-            entity_spans_pair (:obj:`List[Tuple[int, int]]`, :obj:`List[List[Tuple[int, int]]]`, `optional`):
+                `"entity_classification"` or `"entity_pair_classification"` as the `task` argument in the
+                constructor, the length of each sequence must be 1 or 2, respectively. If you specify `entities`, the
+                length of each sequence must be equal to the length of each sequence of `entities`.
+            entity_spans_pair (`List[Tuple[int, int]]`, `List[List[Tuple[int, int]]]`, *optional*):
                 The sequence or batch of sequences of entity spans to be encoded. Each sequence consists of tuples each
                 with two integers denoting character-based start and end positions of entities. If you specify the
-                ``task`` argument in the constructor, this argument is ignored. If you specify ``entities_pair``, the
-                length of each sequence must be equal to the length of each sequence of ``entities_pair``.
-            entities (:obj:`List[str]`, :obj:`List[List[str]]`, `optional`):
+                `task` argument in the constructor, this argument is ignored. If you specify `entities_pair`, the
+                length of each sequence must be equal to the length of each sequence of `entities_pair`.
+            entities (`List[str]`, `List[List[str]]`, *optional*):
                 The sequence or batch of sequences of entities to be encoded. Each sequence consists of strings
                 representing entities, i.e., special entities (e.g., [MASK]) or entity titles of Wikipedia (e.g., Los
-                Angeles). This argument is ignored if you specify the ``task`` argument in the constructor. The length
-                of each sequence must be equal to the length of each sequence of ``entity_spans``. If you specify
-                ``entity_spans`` without specifying this argument, the entity sequence or the batch of entity sequences
+                Angeles). This argument is ignored if you specify the `task` argument in the constructor. The length
+                of each sequence must be equal to the length of each sequence of `entity_spans`. If you specify
+                `entity_spans` without specifying this argument, the entity sequence or the batch of entity sequences
                 is automatically constructed by filling it with the [MASK] entity.
-            entities_pair (:obj:`List[str]`, :obj:`List[List[str]]`, `optional`):
+            entities_pair (`List[str]`, `List[List[str]]`, *optional*):
                 The sequence or batch of sequences of entities to be encoded. Each sequence consists of strings
                 representing entities, i.e., special entities (e.g., [MASK]) or entity titles of Wikipedia (e.g., Los
-                Angeles). This argument is ignored if you specify the ``task`` argument in the constructor. The length
-                of each sequence must be equal to the length of each sequence of ``entity_spans_pair``. If you specify
-                ``entity_spans_pair`` without specifying this argument, the entity sequence or the batch of entity
+                Angeles). This argument is ignored if you specify the `task` argument in the constructor. The length
+                of each sequence must be equal to the length of each sequence of `entity_spans_pair`. If you specify
+                `entity_spans_pair` without specifying this argument, the entity sequence or the batch of entity
                 sequences is automatically constructed by filling it with the [MASK] entity.
-            max_entity_length (:obj:`int`, `optional`):
-                The maximum length of :obj:`entity_ids`.
+            max_entity_length (`int`, *optional*):
+                The maximum length of `entity_ids`.
         """
         # Input type checking for clearer error
         is_valid_single_text = isinstance(text, str)
@@ -969,24 +974,24 @@ class MLukeTokenizer(PreTrainedTokenizer):
         Prepares a sequence of input id, entity id and entity span, or a pair of sequences of inputs ids, entity ids,
         entity spans so that it can be used by the model. It adds special tokens, truncates sequences if overflowing
         while taking into account the special tokens and manages a moving window (with user defined stride) for
-        overflowing tokens. Please Note, for `pair_ids` different than `None` and `truncation_strategy = longest_first`
-        or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an
+        overflowing tokens. Please note, for `pair_ids` different than `None` and `truncation_strategy = longest_first`
+        or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an
         error.
 
         Args:
-            ids (:obj:`List[int]`):
+            ids (`List[int]`):
                 Tokenized input ids of the first sequence.
-            pair_ids (:obj:`List[int]`, `optional`):
+            pair_ids (`List[int]`, *optional*):
                 Tokenized input ids of the second sequence.
-            entity_ids (:obj:`List[int]`, `optional`):
+            entity_ids (`List[int]`, *optional*):
                 Entity ids of the first sequence.
-            pair_entity_ids (:obj:`List[int]`, `optional`):
+            pair_entity_ids (`List[int]`, *optional*):
                 Entity ids of the second sequence.
-            entity_token_spans (:obj:`List[Tuple[int, int]]`, `optional`):
+            entity_token_spans (`List[Tuple[int, int]]`, *optional*):
                 Entity spans of the first sequence.
-            pair_entity_token_spans (:obj:`List[Tuple[int, int]]`, `optional`):
+            pair_entity_token_spans (`List[Tuple[int, int]]`, *optional*):
                 Entity spans of the second sequence.
-            max_entity_length (:obj:`int`, `optional`):
+            max_entity_length (`int`, *optional*):
                 The maximum length of the entity sequence.
         """
 
@@ -1188,46 +1193,45 @@ class MLukeTokenizer(PreTrainedTokenizer):
         """
         Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
         in the batch. Padding side (left/right) padding token ids are defined at the tokenizer level (with
-        ``self.padding_side``, ``self.pad_token_id`` and ``self.pad_token_type_id``) .. note:: If the
-        ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the result
-        will use the same type unless you provide a different tensor type with ``return_tensors``. In the case of
+        `self.padding_side`, `self.pad_token_id` and `self.pad_token_type_id`). Note: if the
+        `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the result
+        will use the same type unless you provide a different tensor type with `return_tensors`. In the case of
         PyTorch tensors, you will lose the specific device of your tensors however.
 
         Args:
-            encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
-                Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
-                List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
-                List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
-                well as in a PyTorch Dataloader collate function. Instead of :obj:`List[int]` you can have tensors
+            encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`):
+                Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of tokenized inputs (list of [`BatchEncoding`],
+                `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
+                well as in a PyTorch Dataloader collate function. Instead of `List[int]` you can have tensors
                 (numpy arrays, PyTorch tensors or TensorFlow tensors), see the note above for the return type.
-            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
                  Select a strategy to pad the returned sequences (according to the model's padding side and padding
                  index) among:
 
-                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                   single sequence if provided).
-                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                   maximum acceptable input length for the model if that argument is not provided.
-                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                   different lengths).
-            max_length (:obj:`int`, `optional`):
+            max_length (`int`, *optional*):
                 Maximum length of the returned list and optionally padding length (see above).
-            max_entity_length (:obj:`int`, `optional`):
+            max_entity_length (`int`, *optional*):
                 The maximum length of the entity sequence.
-            pad_to_multiple_of (:obj:`int`, `optional`):
+            pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-            return_attention_mask (:obj:`bool`, `optional`):
+            return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
-                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute. `What are
-                attention masks? <../glossary.html#attention-mask>`__
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+                to the specific tokenizer's default, defined by the `return_outputs` attribute. [What are
+                attention masks?](../glossary#attention-mask)
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
-            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            verbose (`bool`, *optional*, defaults to `True`):
                 Whether or not to print more information and warnings.
         """
         # If we have a list of dicts, let's convert it in a dict of lists
@@ -1495,17 +1499,17 @@ class MLukeTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An XLM-RoBERTa sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         if token_ids_1 is None:
@@ -1520,18 +1524,18 @@ class MLukeTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -1552,13 +1556,13 @@ class MLukeTokenizer(PreTrainedTokenizer):
         not make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
 
         """
 
diff --git a/src/transformers/models/mmbt/configuration_mmbt.py b/src/transformers/models/mmbt/configuration_mmbt.py
index bbb6c9d240..1137917c34 100644
--- a/src/transformers/models/mmbt/configuration_mmbt.py
+++ b/src/transformers/models/mmbt/configuration_mmbt.py
@@ -23,15 +23,15 @@ logger = logging.get_logger(__name__)
 
 class MMBTConfig(object):
     """
-    This is the configuration class to store the configuration of a :class:`~transformers.MMBTModel`. It is used to
+    This is the configuration class to store the configuration of a [`MMBTModel`]. It is used to
     instantiate a MMBT model according to the specified arguments, defining the model architecture.
 
     Args:
-        config (:class:`~transformers.PreTrainedConfig`):
+        config ([`PretrainedConfig`]):
             Config of the underlying Transformer models. Its values are copied over to use a single config.
-        num_labels (:obj:`int`, `optional`):
+        num_labels (`int`, *optional*):
             Size of final Linear layer for classification.
-        modal_hidden_size (:obj:`int`, `optional`, defaults to 2048):
+        modal_hidden_size (`int`, *optional*, defaults to 2048):
             Embedding dimension of the non-text modality encoder.
     """
 
diff --git a/src/transformers/models/mmbt/modeling_mmbt.py b/src/transformers/models/mmbt/modeling_mmbt.py
index d9b76c6f6b..d30d31fba6 100644
--- a/src/transformers/models/mmbt/modeling_mmbt.py
+++ b/src/transformers/models/mmbt/modeling_mmbt.py
@@ -208,13 +208,14 @@ class MMBTModel(nn.Module, ModuleUtilsMixin):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            # For example purposes. Not runnable.
-            transformer = BertModel.from_pretrained('bert-base-uncased')
-            encoder = ImageEncoder(args)
-            mmbt = MMBTModel(config, transformer, encoder)
-        """
+        ```python
+        # For example purposes. Not runnable.
+        transformer = BertModel.from_pretrained('bert-base-uncased')
+        encoder = ImageEncoder(args)
+        mmbt = MMBTModel(config, transformer, encoder)
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/mobilebert/configuration_mobilebert.py b/src/transformers/models/mobilebert/configuration_mobilebert.py
index 4f8e338d33..a738fc54c7 100644
--- a/src/transformers/models/mobilebert/configuration_mobilebert.py
+++ b/src/transformers/models/mobilebert/configuration_mobilebert.py
@@ -27,78 +27,80 @@ MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class MobileBertConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.MobileBertModel` or a
-    :class:`~transformers.TFMobileBertModel`. It is used to instantiate a MobileBERT model according to the specified
+    This is the configuration class to store the configuration of a [`MobileBertModel`] or a
+    [`TFMobileBertModel`]. It is used to instantiate a MobileBERT model according to the specified
     arguments, defining the model architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the MobileBERT model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.MobileBertModel` or
-            :class:`~transformers.TFMobileBertModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 512):
+            the `input_ids` passed when calling [`MobileBertModel`] or
+            [`TFMobileBertModel`].
+        hidden_size (`int`, *optional*, defaults to 512):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 24):
+        num_hidden_layers (`int`, *optional*, defaults to 24):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 4):
+        num_attention_heads (`int`, *optional*, defaults to 4):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 512):
+        intermediate_size (`int`, *optional*, defaults to 512):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"relu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"relu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.0):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.MobileBertModel`
-            or :class:`~transformers.TFMobileBertModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`MobileBertModel`]
+            or [`TFMobileBertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
 
-        pad_token_id (:obj:`int`, `optional`, defaults to 0):
+        pad_token_id (`int`, *optional*, defaults to 0):
             The ID of the token in the word embedding to use as padding.
-        embedding_size (:obj:`int`, `optional`, defaults to 128):
+        embedding_size (`int`, *optional*, defaults to 128):
             The dimension of the word embedding vectors.
-        trigram_input (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        trigram_input (`bool`, *optional*, defaults to `True`):
             Use a convolution of trigram as input.
-        use_bottleneck (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_bottleneck (`bool`, *optional*, defaults to `True`):
             Whether to use bottleneck in BERT.
-        intra_bottleneck_size (:obj:`int`, `optional`, defaults to 128):
+        intra_bottleneck_size (`int`, *optional*, defaults to 128):
             Size of bottleneck layer output.
-        use_bottleneck_attention (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_bottleneck_attention (`bool`, *optional*, defaults to `False`):
             Whether to use attention inputs from the bottleneck transformation.
-        key_query_shared_bottleneck (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        key_query_shared_bottleneck (`bool`, *optional*, defaults to `True`):
             Whether to use the same linear transformation for query&key in the bottleneck.
-        num_feedforward_networks (:obj:`int`, `optional`, defaults to 4):
+        num_feedforward_networks (`int`, *optional*, defaults to 4):
             Number of FFNs in a block.
-        normalization_type (:obj:`str`, `optional`, defaults to :obj:`"no_norm"`):
+        normalization_type (`str`, *optional*, defaults to `"no_norm"`):
             The normalization type in MobileBERT.
-        classifier_dropout (:obj:`float`, `optional`):
+        classifier_dropout (`float`, *optional*):
             The dropout ratio for the classification head.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import MobileBertModel, MobileBertConfig
+    ```python
+    >>> from transformers import MobileBertModel, MobileBertConfig
 
-        >>> # Initializing a MobileBERT configuration
-        >>> configuration = MobileBertConfig()
+    >>> # Initializing a MobileBERT configuration
+    >>> configuration = MobileBertConfig()
 
-        >>> # Initializing a model from the configuration above
-        >>> model = MobileBertModel(configuration)
+    >>> # Initializing a model from the configuration above
+    >>> model = MobileBertModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
 
     Attributes: pretrained_config_archive_map (Dict[str, str]): A dictionary containing all the available pre-trained
     checkpoints.
diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
index 6de104a941..db7ad2ae09 100644
--- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
+++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
@@ -1031,18 +1031,18 @@ class TFMobileBertForPreTraining(TFMobileBertPreTrainedModel):
         r"""
         Return:
 
-        Examples::
+        Examples:
 
-            >>> import tensorflow as tf
-            >>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
+        ```python
+        >>> import tensorflow as tf
+        >>> from transformers import MobileBertTokenizer, TFMobileBertForPreTraining
 
-            >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
-            >>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
-            >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-            >>> outputs = model(input_ids)
-            >>> prediction_scores, seq_relationship_scores = outputs[:2]
-
-        """
+        >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+        >>> model = TFMobileBertForPreTraining.from_pretrained('google/mobilebert-uncased')
+        >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> prediction_scores, seq_relationship_scores = outputs[:2]
+        ```"""
         inputs = input_processing(
             func=self.call,
             config=self.config,
@@ -1242,20 +1242,21 @@ class TFMobileBertForNextSentencePrediction(TFMobileBertPreTrainedModel, TFNextS
         r"""
         Return:
 
-        Examples::
+        Examples:
 
-            >>> import tensorflow as tf
-            >>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
+        ```python
+        >>> import tensorflow as tf
+        >>> from transformers import MobileBertTokenizer, TFMobileBertForNextSentencePrediction
 
-            >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
-            >>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
+        >>> tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
+        >>> model = TFMobileBertForNextSentencePrediction.from_pretrained('google/mobilebert-uncased')
 
-            >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
-            >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
-            >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='tf')
 
-            >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
-        """
+        >>> logits = model(encoding['input_ids'], token_type_ids=encoding['token_type_ids'])[0]
+        ```"""
         inputs = input_processing(
             func=self.call,
             config=self.config,
diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert.py b/src/transformers/models/mobilebert/tokenization_mobilebert.py
index b19fdcbf75..ef9828c407 100644
--- a/src/transformers/models/mobilebert/tokenization_mobilebert.py
+++ b/src/transformers/models/mobilebert/tokenization_mobilebert.py
@@ -37,10 +37,10 @@ class MobileBertTokenizer(BertTokenizer):
     r"""
     Construct a MobileBERT tokenizer.
 
-    :class:`~transformers.MobileBertTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    [`MobileBertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
     tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py b/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
index 702d4d98b3..28eced0356 100644
--- a/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
+++ b/src/transformers/models/mobilebert/tokenization_mobilebert_fast.py
@@ -39,12 +39,12 @@ PRETRAINED_INIT_CONFIGURATION = {}
 
 class MobileBertTokenizerFast(BertTokenizerFast):
     r"""
-    Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" MobileBERT tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.MobileBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+    [`MobileBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/mpnet/configuration_mpnet.py b/src/transformers/models/mpnet/configuration_mpnet.py
index 0026b1d6eb..a6fc99486b 100644
--- a/src/transformers/models/mpnet/configuration_mpnet.py
+++ b/src/transformers/models/mpnet/configuration_mpnet.py
@@ -28,57 +28,58 @@ MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class MPNetConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.MPNetModel` or a
-    :class:`~transformers.TFMPNetModel`. It is used to instantiate a MPNet model according to the specified arguments,
+    This is the configuration class to store the configuration of a [`MPNetModel`] or a
+    [`TFMPNetModel`]. It is used to instantiate a MPNet model according to the specified arguments,
     defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
-    to that of the MPNet `mpnet-base <https://huggingface.co/mpnet-base>`__ architecture.
+    to that of the MPNet [mpnet-base](https://huggingface.co/mpnet-base) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30527):
+        vocab_size (`int`, *optional*, defaults to 30527):
             Vocabulary size of the MPNet model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.MPNetModel` or
-            :class:`~transformers.TFMPNetModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `input_ids` passed when calling [`MPNetModel`] or
+            [`TFMPNetModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        relative_attention_num_buckets (:obj:`int`, `optional`, defaults to 32):
+        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
             The number of buckets to use for each attention layer.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import MPNetModel, MPNetConfig
+    ```python
+    >>> from transformers import MPNetModel, MPNetConfig
 
-        >>> # Initializing a MPNet mpnet-base style configuration
-        >>> configuration = MPNetConfig()
+    >>> # Initializing a MPNet mpnet-base style configuration
+    >>> configuration = MPNetConfig()
 
-        >>> # Initializing a model from the mpnet-base style configuration
-        >>> model = MPNetModel(configuration)
+    >>> # Initializing a model from the mpnet-base style configuration
+    >>> model = MPNetModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "mpnet"
 
     def __init__(
diff --git a/src/transformers/models/mpnet/tokenization_mpnet.py b/src/transformers/models/mpnet/tokenization_mpnet.py
index c59cd56ec0..1de9746a21 100644
--- a/src/transformers/models/mpnet/tokenization_mpnet.py
+++ b/src/transformers/models/mpnet/tokenization_mpnet.py
@@ -66,56 +66,61 @@ def whitespace_tokenize(text):
 class MPNetTokenizer(PreTrainedTokenizer):
     """
 
-    This tokenizer inherits from :class:`~transformers.BertTokenizer` which contains most of the methods. Users should
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the methods. Users should
     refer to the superclass for more information regarding methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
             Whether or not to do basic tokenization before WordPiece.
-        never_split (:obj:`Iterable`, `optional`):
+        never_split (`Iterable`, *optional*):
             Collection of tokens which will never be split during tokenization. Only has an effect when
-            :obj:`do_basic_tokenize=True`
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+            `do_basic_tokenize=True`
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pre-training. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
             Whether or not to tokenize Chinese characters.
 
-            This should likely be deactivated for Japanese (see this `issue
-            <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+            This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
+            value for `lowercase` (as in the original BERT).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -229,17 +234,17 @@ class MPNetTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A MPNet sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -252,18 +257,18 @@ class MPNetTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` methods.
+        special tokens using the tokenizer `prepare_for_model` methods.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Set to True if the token list is already formatted with special tokens for the model
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
@@ -282,13 +287,13 @@ class MPNetTokenizer(PreTrainedTokenizer):
         make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -324,19 +329,18 @@ class BasicTokenizer(object):
     Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
 
     Args:
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        never_split (:obj:`Iterable`, `optional`):
+        never_split (`Iterable`, *optional*):
             Collection of tokens which will never be split during tokenization. Only has an effect when
-            :obj:`do_basic_tokenize=True`
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
             Whether or not to tokenize Chinese characters.
 
-            This should likely be deactivated for Japanese (see this `issue
-            <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+            This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
+            value for `lowercase` (as in the original BERT).
     """
 
     def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
@@ -353,9 +357,9 @@ class BasicTokenizer(object):
         WordPieceTokenizer.
 
         Args:
-            **never_split**: (`optional`) list of str
+            never_split (`List[str]`, *optional*):
                 Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                :func:`PreTrainedTokenizer.tokenize`) List of token not to split.
+                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
         """
         # union() returns a new set by concatenating the two sets.
         never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
@@ -482,14 +486,14 @@ class WordpieceTokenizer(object):
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
-          text: A single token or whitespace separated tokens. This should have
-            already been passed through `BasicTokenizer`.
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
 
         Returns:
-          A list of wordpiece tokens.
+            A list of wordpiece tokens.
         """
 
         output_tokens = []
diff --git a/src/transformers/models/mpnet/tokenization_mpnet_fast.py b/src/transformers/models/mpnet/tokenization_mpnet_fast.py
index 8b5aedb278..87b50e144a 100644
--- a/src/transformers/models/mpnet/tokenization_mpnet_fast.py
+++ b/src/transformers/models/mpnet/tokenization_mpnet_fast.py
@@ -50,51 +50,57 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class MPNetTokenizerFast(PreTrainedTokenizerFast):
     r"""
-    Construct a "fast" MPNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on WordPiece.
+    Construct a "fast" MPNet tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this
-            issue <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
+            value for `lowercase` (as in the original BERT).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -151,11 +157,11 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
     @property
     def mask_token(self) -> str:
         """
-        :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
+        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
         not having been set.
 
         MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
-        comprise the space before the `<mask>`.
+        comprise the space before the `<mask>`.
         """
         if self._mask_token is None and self.verbose:
             logger.error("Using mask_token, but it is not set yet.")
@@ -189,13 +195,13 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
         make use of token type ids, therefore a list of zeros is returned
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
diff --git a/src/transformers/models/mt5/configuration_mt5.py b/src/transformers/models/mt5/configuration_mt5.py
index a5b01da8cb..e2275c5443 100644
--- a/src/transformers/models/mt5/configuration_mt5.py
+++ b/src/transformers/models/mt5/configuration_mt5.py
@@ -23,44 +23,43 @@ logger = logging.get_logger(__name__)
 
 class MT5Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.MT5Model` or a
-    :class:`~transformers.TFMT5Model`. It is used to instantiate a mT5 model according to the specified arguments,
+    This is the configuration class to store the configuration of a [`MT5Model`] or a
+    [`TFMT5Model`]. It is used to instantiate a mT5 model according to the specified arguments,
     defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
-    to that of the mT5 `google/mt5-small <https://huggingface.co/google/mt5-small>`__ architecture.
+    to that of the mT5 [google/mt5-small](https://huggingface.co/google/mt5-small) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Arguments:
-        vocab_size (:obj:`int`, `optional`, defaults to 250112):
+        vocab_size (`int`, *optional*, defaults to 250112):
             Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.T5Model` or :class:`~transformers.TFT5Model`.
-        d_model (:obj:`int`, `optional`, defaults to 512):
+            `inputs_ids` passed when calling [`T5Model`] or [`TFT5Model`].
+        d_model (`int`, *optional*, defaults to 512):
             Size of the encoder layers and the pooler layer.
-        d_kv (:obj:`int`, `optional`, defaults to 64):
-            Size of the key, query, value projections per attention head. :obj:`d_kv` has to be equal to :obj:`d_model
-            // num_heads`.
-        d_ff (:obj:`int`, `optional`, defaults to 1024):
-            Size of the intermediate feed forward layer in each :obj:`T5Block`.
-        num_layers (:obj:`int`, `optional`, defaults to 8):
+        d_kv (`int`, *optional*, defaults to 64):
+            Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // num_heads`.
+        d_ff (`int`, *optional*, defaults to 1024):
+            Size of the intermediate feed forward layer in each `T5Block`.
+        num_layers (`int`, *optional*, defaults to 8):
             Number of hidden layers in the Transformer encoder.
-        num_decoder_layers (:obj:`int`, `optional`):
-            Number of hidden layers in the Transformer decoder. Will use the same value as :obj:`num_layers` if not
+        num_decoder_layers (`int`, *optional*):
+            Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not
             set.
-        num_heads (:obj:`int`, `optional`, defaults to 6):
+        num_heads (`int`, *optional*, defaults to 6):
             Number of attention heads for each attention layer in the Transformer encoder.
-        relative_attention_num_buckets (:obj:`int`, `optional`, defaults to 32):
+        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
             The number of buckets to use for each attention layer.
-        dropout_rate (:obj:`float`, `optional`, defaults to 0.1):
+        dropout_rate (`float`, *optional*, defaults to 0.1):
             The ratio for all dropout layers.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-6):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
             The epsilon used by the layer normalization layers.
-        initializer_factor (:obj:`float`, `optional`, defaults to 1):
+        initializer_factor (`float`, *optional*, defaults to 1):
             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
             testing).
-        feed_forward_proj (:obj:`string`, `optional`, defaults to :obj:`"gated-gelu"`):
-            Type of feed forward layer to be used. Should be one of :obj:`"relu"` or :obj:`"gated-gelu"`.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        feed_forward_proj (`str`, *optional*, defaults to `"gated-gelu"`):
+            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`.
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
     """
     model_type = "mt5"
diff --git a/src/transformers/models/openai/configuration_openai.py b/src/transformers/models/openai/configuration_openai.py
index 5ba2a80078..7f212b6c3f 100644
--- a/src/transformers/models/openai/configuration_openai.py
+++ b/src/transformers/models/openai/configuration_openai.py
@@ -26,91 +26,92 @@ OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://huggingface.c
 
 class OpenAIGPTConfig(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a :class:`~transformers.OpenAIGPTModel` or a
-    :class:`~transformers.TFOpenAIGPTModel`. It is used to instantiate a GPT model according to the specified
+    This is the configuration class to store the configuration of a [`OpenAIGPTModel`] or a
+    [`TFOpenAIGPTModel`]. It is used to instantiate a GPT model according to the specified
     arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a similar
-    configuration to that of the `GPT <https://huggingface.co/openai-gpt>`__ architecture from OpenAI.
+    configuration to that of the [GPT](https://huggingface.co/openai-gpt) architecture from OpenAI.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 40478):
+        vocab_size (`int`, *optional*, defaults to 40478):
             Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.OpenAIGPTModel` or
-            :class:`~transformers.TFOpenAIGPTModel`.
-        n_positions (:obj:`int`, `optional`, defaults to 512):
+            `inputs_ids` passed when calling [`OpenAIGPTModel`] or
+            [`TFOpenAIGPTModel`].
+        n_positions (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        n_embd (:obj:`int`, `optional`, defaults to 768):
+        n_embd (`int`, *optional*, defaults to 768):
             Dimensionality of the embeddings and hidden states.
-        n_layer (:obj:`int`, `optional`, defaults to 12):
+        n_layer (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        n_head (:obj:`int`, `optional`, defaults to 12):
+        n_head (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        afn (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+        afn (`str` or `Callable`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        resid_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        embd_pdrop (:obj:`int`, `optional`, defaults to 0.1):
+        embd_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the embeddings.
-        attn_pdrop (:obj:`float`, `optional`, defaults to 0.1):
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention.
-        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
             The epsilon to use in the layer normalization layers
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        predict_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        predict_special_tokens (`bool`, *optional*, defaults to `True`):
             Whether or not special tokens should be predicted when the model has a language modeling head.
-        summary_type (:obj:`str`, `optional`, defaults to :obj:`"cls_index"`):
+        summary_type (`str`, *optional*, defaults to `"cls_index"`):
             Argument used when doing sequence summary, used in the models
-            :class:`~transformers.OpenAIGPTDoubleHeadsModel` and :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            [`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
 
             Has to be one of the following options:
 
-                - :obj:`"last"`: Take the last token hidden state (like XLNet).
-                - :obj:`"first"`: Take the first token hidden state (like BERT).
-                - :obj:`"mean"`: Take the mean of all tokens hidden states.
-                - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
-                - :obj:`"attn"`: Not implemented now, use multi-head attention.
-        summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                - `"last"`: Take the last token hidden state (like XLNet).
+                - `"first"`: Take the first token hidden state (like BERT).
+                - `"mean"`: Take the mean of all tokens hidden states.
+                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+                - `"attn"`: Not implemented now, use multi-head attention.
+        summary_use_proj (`bool`, *optional*, defaults to `True`):
             Argument used when doing sequence summary, used in the models
-            :class:`~transformers.OpenAIGPTDoubleHeadsModel` and :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            [`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
 
             Whether or not to add a projection after the vector extraction.
-        summary_activation (:obj:`str`, `optional`):
+        summary_activation (`str`, *optional*):
             Argument used when doing sequence summary, used in the models
-            :class:`~transformers.OpenAIGPTDoubleHeadsModel` and :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            [`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
 
-            Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation.
-        summary_proj_to_labels (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
+        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
             Argument used when doing sequence summary, used in the models
-            :class:`~transformers.OpenAIGPTDoubleHeadsModel` and :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            [`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
 
-            Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes.
-        summary_first_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
+        summary_first_dropout (`float`, *optional*, defaults to 0.1):
             Argument used when doing sequence summary, used in the models
-            :class:`~transformers.OpenAIGPTDoubleHeadsModel` and :class:`~transformers.OpenAIGPTDoubleHeadsModel`.
+            [`OpenAIGPTDoubleHeadsModel`] and [`TFOpenAIGPTDoubleHeadsModel`].
 
             The dropout ratio to be used after the projection and activation.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
 
 
-    Examples::
+    Examples:
 
-        >>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
+    ```python
+    >>> from transformers import OpenAIGPTConfig, OpenAIGPTModel
 
-        >>> # Initializing a GPT configuration
-        >>> configuration = OpenAIGPTConfig()
+    >>> # Initializing a GPT configuration
+    >>> configuration = OpenAIGPTConfig()
 
-        >>> # Initializing a model from the configuration
-        >>> model = OpenAIGPTModel(configuration)
+    >>> # Initializing a model from the configuration
+    >>> model = OpenAIGPTModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "openai-gpt"
     attribute_map = {
diff --git a/src/transformers/models/openai/tokenization_openai.py b/src/transformers/models/openai/tokenization_openai.py
index e5bc6b245f..0b6987000c 100644
--- a/src/transformers/models/openai/tokenization_openai.py
+++ b/src/transformers/models/openai/tokenization_openai.py
@@ -75,18 +75,18 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
     Construct a GPT Tokenizer. Based on Byte-Pair-Encoding with the following peculiarities:
 
     - lowercases all inputs,
-    - uses :obj:`SpaCy` tokenizer and :obj:`ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's
-      :obj:`BasicTokenizer` if not.
+    - uses `SpaCy` tokenizer and `ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's
+      `BasicTokenizer` if not.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Path to the merges file.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
     """
diff --git a/src/transformers/models/openai/tokenization_openai_fast.py b/src/transformers/models/openai/tokenization_openai_fast.py
index 0b15b6efaa..88bd569ba8 100644
--- a/src/transformers/models/openai/tokenization_openai_fast.py
+++ b/src/transformers/models/openai/tokenization_openai_fast.py
@@ -39,21 +39,21 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class OpenAIGPTTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" GPT Tokenizer (backed by HuggingFace's `tokenizers` library). Based on Byte-Pair-Encoding with
+    Construct a "fast" GPT Tokenizer (backed by HuggingFace's *tokenizers* library). Based on Byte-Pair-Encoding with
     the following peculiarities:
 
     - lower case all inputs
     - uses BERT's BasicTokenizer for pre-BPE tokenization
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Path to the merges file.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
     """
diff --git a/src/transformers/models/pegasus/configuration_pegasus.py b/src/transformers/models/pegasus/configuration_pegasus.py
index 8cf76c482b..884a4524b5 100644
--- a/src/transformers/models/pegasus/configuration_pegasus.py
+++ b/src/transformers/models/pegasus/configuration_pegasus.py
@@ -28,77 +28,77 @@ PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class PegasusConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.PegasusModel`. It is used to
+    This is the configuration class to store the configuration of a [`PegasusModel`]. It is used to
     instantiate an PEGASUS model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the PEGASUS `google/pegasus-large
-    <https://huggingface.co/google/pegasus-large>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the PEGASUS [google/pegasus-large](https://huggingface.co/google/pegasus-large) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 50265):
             Vocabulary size of the PEGASUS model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.PegasusModel` or
-            :class:`~transformers.TFPegasusModel`.
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            `inputs_ids` passed when calling [`PegasusModel`] or
+            [`TFPegasusModel`].
+        d_model (`int`, *optional*, defaults to 1024):
             Dimensionality of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 12):
+        encoder_layers (`int`, *optional*, defaults to 12):
             Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+        decoder_layers (`int`, *optional*, defaults to 12):
             Number of decoder layers.
-        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for classifier.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
             Scale embeddings by diving by sqrt(d_model).
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models)
-        forced_eos_token_id (:obj:`int`, `optional`, defaults to 1):
-            The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
-            :obj:`eos_token_id`.
+        forced_eos_token_id (`int`, *optional*, defaults to 1):
+            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+            `eos_token_id`.
 
-    Example::
+    Example:
 
-        >>> from transformers import PegasusModel, PegasusConfig
+    ```python
+    >>> from transformers import PegasusModel, PegasusConfig
 
-        >>> # Initializing a PEGASUS google/pegasus-large style configuration
-        >>> configuration = PegasusConfig()
+    >>> # Initializing a PEGASUS google/pegasus-large style configuration
+    >>> configuration = PegasusConfig()
 
-        >>> # Initializing a model from the google/pegasus-large style configuration
-        >>> model = PegasusModel(configuration)
+    >>> # Initializing a model from the google/pegasus-large style configuration
+    >>> model = PegasusModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "pegasus"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/pegasus/modeling_flax_pegasus.py b/src/transformers/models/pegasus/modeling_flax_pegasus.py
index f55a993801..c6f72cbc47 100644
--- a/src/transformers/models/pegasus/modeling_flax_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_flax_pegasus.py
@@ -989,17 +989,18 @@ class FlaxPegasusPreTrainedModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
+        ```python
+        >>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
 
-            >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
-            >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
+        >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
+        >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
-            >>> encoder_outputs = model.encode(**inputs)
-        """
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+        >>> encoder_outputs = model.encode(**inputs)
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1054,23 +1055,24 @@ class FlaxPegasusPreTrainedModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
+        ```python
+        >>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
 
-            >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
-            >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
+        >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
+        >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
-            >>> encoder_outputs = model.encode(**inputs)
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+        >>> encoder_outputs = model.encode(**inputs)
 
-            >>> decoder_start_token_id = model.config.decoder_start_token_id
-            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> last_decoder_hidden_states = outputs.last_hidden_state
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> last_decoder_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1322,23 +1324,24 @@ class FlaxPegasusForConditionalGeneration(FlaxPegasusPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
+        ```python
+        >>> from transformers import PegasusTokenizer, FlaxPegasusForConditionalGeneration
 
-            >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
-            >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
+        >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
+        >>> tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-large')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
-            >>> encoder_outputs = model.encode(**inputs)
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors='np')
+        >>> encoder_outputs = model.encode(**inputs)
 
-            >>> decoder_start_token_id = model.config.decoder_start_token_id
-            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> logits = outputs.logits
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py
index b5e4879be2..f4d4d3ca79 100755
--- a/src/transformers/models/pegasus/modeling_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_pegasus.py
@@ -1197,19 +1197,20 @@ class PegasusModel(PegasusPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import PegasusTokenizer, PegasusModel
+        ```python
+        >>> from transformers import PegasusTokenizer, PegasusModel
 
-            >>> tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
-            >>> model = PegasusModel.from_pretrained("google/pegasus-large")
+        >>> tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
+        >>> model = PegasusModel.from_pretrained("google/pegasus-large")
 
-            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
-            >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
-            >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
 
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
diff --git a/src/transformers/models/pegasus/tokenization_pegasus.py b/src/transformers/models/pegasus/tokenization_pegasus.py
index 15f6364923..c582bc71bc 100644
--- a/src/transformers/models/pegasus/tokenization_pegasus.py
+++ b/src/transformers/models/pegasus/tokenization_pegasus.py
@@ -40,56 +40,57 @@ logger = logging.get_logger(__name__)
 
 class PegasusTokenizer(PreTrainedTokenizer):
     r"""
-    Construct a PEGASUS tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Construct a PEGASUS tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask_2>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask_2>"`):
             The token used for masking single token values. This is the token used when training this model with masked
             language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
-            It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
-            Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
-        mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`"<mask_1>"`):
+            It corresponds to *[MASK2]* in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
+            Summarization](https://arxiv.org/pdf/1912.08777.pdf).
+        mask_token_sent (`str`, *optional*, defaults to `"<mask_1>"`):
             The token used for masking whole target sentences. This is the token used when training this model with gap
             sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
-            pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for
-            Abstractive Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
-        additional_special_tokens (:obj:`List[str]`, `optional`):
+            pretraining. It corresponds to *[MASK1]* in [PEGASUS: Pre-training with Extracted Gap-sentences for
+            Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf).
+        additional_special_tokens (`List[str]`, *optional*):
             Additional special tokens used by the tokenizer. If no additional_special_tokens are provided <mask_2> and
-            <unk_2, ..., unk_102> are used as additional special tokens corresponding to the `original PEGASUS
-            tokenizer
-            <https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66>`__
+            <unk_2, ..., unk_102> are used as additional special tokens corresponding to the [original PEGASUS
+            tokenizer](https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66)
             that uses the tokens 2 - 104 only for pretraining
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
     """
     vocab_files_names = VOCAB_FILES_NAMES
@@ -252,22 +253,22 @@ class PegasusTokenizer(PreTrainedTokenizer):
     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
         """
         Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
-        and adding special tokens. A PEGASUS sequence has the following format, where ``X`` represents the sequence:
+        and adding special tokens. A PEGASUS sequence has the following format, where `X` represents the sequence:
 
-        - single sequence: ``X </s>``
-        - pair of sequences: ``A B </s>`` (not intended use)
+        - single sequence: `X </s>`
+        - pair of sequences: `A B </s>` (not intended use)
 
         BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
         separator.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return token_ids_0 + [self.eos_token_id]
diff --git a/src/transformers/models/pegasus/tokenization_pegasus_fast.py b/src/transformers/models/pegasus/tokenization_pegasus_fast.py
index 21c77594ea..12d5fcaeed 100644
--- a/src/transformers/models/pegasus/tokenization_pegasus_fast.py
+++ b/src/transformers/models/pegasus/tokenization_pegasus_fast.py
@@ -51,43 +51,44 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class PegasusTokenizerFast(PreTrainedTokenizerFast):
     r"""
-    Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
-    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
+    Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's *tokenizers* library). Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask_2>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask_2>"`):
             The token used for masking single token values. This is the token used when training this model with masked
             language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
-            It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
-            Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
-        mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`"<mask_1>"`):
+            It corresponds to *[MASK2]* in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
+            Summarization](https://arxiv.org/pdf/1912.08777.pdf).
+        mask_token_sent (`str`, *optional*, defaults to `"<mask_1>"`):
             The token used for masking whole target sentences. This is the token used when training this model with gap
             sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
-            pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for
-            Abstractive Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
-        additional_special_tokens (:obj:`List[str]`, `optional`):
+            pretraining. It corresponds to *[MASK1]* in [PEGASUS: Pre-training with Extracted Gap-sentences for
+            Abstractive Summarization](https://arxiv.org/pdf/1912.08777.pdf).
+        additional_special_tokens (`List[str]`, *optional*):
             Additional special tokens used by the tokenizer. If no additional_special_tokens are provided <mask_2> and
-            <unk_2, ..., unk_102> are used as additional special tokens corresponding to the `original PEGASUS
-            tokenizer
-            <https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66>`__
+            <unk_2, ..., unk_102> are used as additional special tokens corresponding to the [original PEGASUS
+            tokenizer](https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66)
             that uses the tokens 2 - 104 only for pretraining
     """
     vocab_files_names = VOCAB_FILES_NAMES
@@ -175,17 +176,17 @@ class PegasusTokenizerFast(PreTrainedTokenizerFast):
         """
         Build model inputs from a sequence by adding eos to the end. no bos token is added to the front.
 
-        - single sequence: ``X </s>``
-        - pair of sequences: ``A B </s>`` (not intended use)
+        - single sequence: `X </s>`
+        - pair of sequences: `A B </s>` (not intended use)
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return token_ids_0 + [self.eos_token_id]
diff --git a/src/transformers/models/perceiver/configuration_perceiver.py b/src/transformers/models/perceiver/configuration_perceiver.py
index 849f2413de..79a6d60689 100644
--- a/src/transformers/models/perceiver/configuration_perceiver.py
+++ b/src/transformers/models/perceiver/configuration_perceiver.py
@@ -28,85 +28,86 @@ PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class PerceiverConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.PerceiverModel`. It is used
+    This is the configuration class to store the configuration of a [`PerceiverModel`]. It is used
     to instantiate an Perceiver model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the Perceiver
-    `deepmind/language-perceiver <https://huggingface.co/deepmind/language-perceiver>`__ architecture.
+    [deepmind/language-perceiver](https://huggingface.co/deepmind/language-perceiver) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        num_latents (:obj:`int`, `optional`, defaults to 256):
+        num_latents (`int`, *optional*, defaults to 256):
             The number of latents.
-        d_latents (:obj:`int`, `optional`, defaults to 1280):
+        d_latents (`int`, *optional*, defaults to 1280):
             Dimension of the latent embeddings.
-        d_model (:obj:`int`, `optional`, defaults to 768):
-            Dimension of the inputs. Should only be provided in case [`PerceiverTextPreprocessor`] is used or no
+        d_model (`int`, *optional*, defaults to 768):
+            Dimension of the inputs. Should only be provided in case [`PerceiverTextPreprocessor`] is used or no
             preprocessor is provided.
-        num_blocks (:obj:`int`, `optional`, defaults to 1):
+        num_blocks (`int`, *optional*, defaults to 1):
             Number of blocks in the Transformer encoder.
-        num_self_attends_per_block (:obj:`int`, `optional`, defaults to 26):
+        num_self_attends_per_block (`int`, *optional*, defaults to 26):
             The number of self-attention layers per block.
-        num_self_attention_heads (:obj:`int`, `optional`, defaults to 8):
+        num_self_attention_heads (`int`, *optional*, defaults to 8):
             Number of attention heads for each self-attention layer in the Transformer encoder.
-        num_cross_attention_heads (:obj:`int`, `optional`, defaults to 8):
+        num_cross_attention_heads (`int`, *optional*, defaults to 8):
             Number of attention heads for each cross-attention layer in the Transformer encoder.
-        qk_channels (:obj:`int`, `optional`):
+        qk_channels (`int`, *optional*):
             Dimension to project the queries + keys before applying attention in the cross-attention and self-attention
             layers of the encoder. Will default to preserving the dimension of the queries if not specified.
-        v_channels (:obj:`int`, `optional`):
+        v_channels (`int`, *optional*):
             Dimension to project the values before applying attention in the cross-attention and self-attention layers
             of the encoder. Will default to preserving the dimension of the queries if not specified.
-        cross_attention_shape_for_attention (:obj:`str`, `optional`, defaults to :obj:`'kv'`):
+        cross_attention_shape_for_attention (`str`, *optional*, defaults to `'kv'`):
             Dimension to use when downsampling the queries and keys in the cross-attention layer of the encoder.
-        self_attention_widening_factor (:obj:`int`, `optional`, defaults to 1):
+        self_attention_widening_factor (`int`, *optional*, defaults to 1):
             Dimension of the feed-forward layer in the cross-attention layer of the Transformer encoder.
-        cross_attention_widening_factor (:obj:`int`, `optional`, defaults to 1):
+        cross_attention_widening_factor (`int`, *optional*, defaults to 1):
             Dimension of the feed-forward layer in the self-attention layers of the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        use_query_residual (:obj:`float`, `optional`, defaults to :obj:`True`):
+        use_query_residual (`bool`, *optional*, defaults to `True`):
             Whether to add a query residual in the cross-attention layer of the encoder.
-        vocab_size (:obj:`int`, `optional`, defaults to 262):
+        vocab_size (`int`, *optional*, defaults to 262):
             Vocabulary size for the masked language modeling model.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
             The maximum sequence length that the masked language modeling model might ever be used with. Typically set
             this to something large just in case (e.g., 512 or 1024 or 2048).
-        image_size (:obj:`int`, `optional`, defaults to 56):
-            Size of the images after preprocessing, for :class:`~transformers.PerceiverForImageClassificationLearned`.
-        train_size (:obj:`List[int]`, `optional`, defaults to [368, 496]):
+        image_size (`int`, *optional*, defaults to 56):
+            Size of the images after preprocessing, for [`PerceiverForImageClassificationLearned`].
+        train_size (`List[int]`, *optional*, defaults to [368, 496]):
             Training size of the images for the optical flow model.
-        num_frames (:obj:`int`, `optional`, defaults to 16):
+        num_frames (`int`, *optional*, defaults to 16):
             Number of video frames used for the multimodal autoencoding model.
-        audio_samples_per_frame (:obj:`int`, `optional`, defaults to 1920):
+        audio_samples_per_frame (`int`, *optional*, defaults to 1920):
             Number of audio samples per frame for the multimodal autoencoding model.
-        samples_per_patch (:obj:`int`, `optional`, defaults to 16):
+        samples_per_patch (`int`, *optional*, defaults to 16):
             Number of audio samples per patch when preprocessing the audio for the multimodal autoencoding model.
-        output_shape (:obj:`List[int]`, `optional`, defaults to :obj:`[1, 16, 224, 224]`):
+        output_shape (`List[int]`, *optional*, defaults to `[1, 16, 224, 224]`):
             Shape of the output (batch_size, num_frames, height, width) for the video decoder queries of the multimodal
             autoencoding model. This excludes the channel dimension.
 
-    Example::
+    Example:
 
-        >>> from transformers import PerceiverModel, PerceiverConfig
+    ```python
+    >>> from transformers import PerceiverModel, PerceiverConfig
 
-        >>> # Initializing a Perceiver deepmind/language-perceiver style configuration
-        >>> configuration = PerceiverConfig()
+    >>> # Initializing a Perceiver deepmind/language-perceiver style configuration
+    >>> configuration = PerceiverConfig()
 
-        >>> # Initializing a model from the deepmind/language-perceiver style configuration
-        >>> model = PerceiverModel(configuration)
+    >>> # Initializing a model from the deepmind/language-perceiver style configuration
+    >>> model = PerceiverModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "perceiver"
 
     def __init__(
diff --git a/src/transformers/models/perceiver/feature_extraction_perceiver.py b/src/transformers/models/perceiver/feature_extraction_perceiver.py
index a15c7df204..49ca6d9cba 100644
--- a/src/transformers/models/perceiver/feature_extraction_perceiver.py
+++ b/src/transformers/models/perceiver/feature_extraction_perceiver.py
@@ -38,31 +38,31 @@ class PerceiverFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi
     r"""
     Constructs a Perceiver feature extractor.
 
-    This feature extractor inherits from :class:`~transformers.ImageFeatureExtractionMixin` which contains most of the
+    This feature extractor inherits from [`ImageFeatureExtractionMixin`] which contains most of the
     main methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        do_center_crop (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to crop the input at the center. If the input size is smaller than :obj:`crop_size` along any edge,
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to crop the input at the center. If the input size is smaller than `crop_size` along any edge,
             the image is padded with 0's and then center cropped.
-        crop_size (:obj:`int`, `optional`, defaults to 256):
-            Desired output size when applying center-cropping. Only has an effect if :obj:`do_center_crop` is set to
-            :obj:`True`.
-        do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to resize the input to a certain :obj:`size`.
-        size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 224):
+        crop_size (`int`, *optional*, defaults to 256):
+            Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to
+            `True`.
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input to a certain `size`.
+        size (`int` or `Tuple(int)`, *optional*, defaults to 224):
             Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
-            integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize`
-            is set to :obj:`True`.
-        resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BICUBIC`):
-            An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
-            :obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
-            Only has an effect if :obj:`do_resize` is set to :obj:`True`.
-        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to normalize the input with :obj:`image_mean` and :obj:`image_std`.
-        image_mean (:obj:`List[int]`, defaults to :obj:`[0.485, 0.456, 0.406]`):
+            integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
+            is set to `True`.
+        resample (`int`, *optional*, defaults to `PIL.Image.BICUBIC`):
+            An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
+            `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
+            Only has an effect if `do_resize` is set to `True`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether or not to normalize the input with `image_mean` and `image_std`.
+        image_mean (`List[float]`, defaults to `[0.485, 0.456, 0.406]`):
             The sequence of means for each channel, to be used when normalizing images.
-        image_std (:obj:`List[int]`, defaults to :obj:`[0.229, 0.224, 0.225]`):
+        image_std (`List[float]`, defaults to `[0.229, 0.224, 0.225]`):
             The sequence of standard deviations for each channel, to be used when normalizing images.
     """
 
@@ -92,11 +92,11 @@ class PerceiverFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi
 
     def center_crop(self, image):
         """
-        Crops :obj:`image` to `self.crop_size` using a center crop. Note that if the image is too small to be cropped
+        Crops `image` to `self.crop_size` using a center crop. Note that if the image is too small to be cropped
         to the size given, it will be padded (so the returned result has the size asked).
 
         Args:
-            image (:obj:`PIL.Image.Image` or :obj:`np.ndarray` or :obj:`torch.Tensor`):
+            image (`PIL.Image.Image` or `np.ndarray` or `torch.Tensor`):
                 The image to resize.
         """
 
@@ -125,27 +125,29 @@ class PerceiverFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi
         """
         Main method to prepare for the model one or several image(s).
 
-        .. warning::
+        <Tip warning={true}>
 
-           NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
-           PIL images.
+        NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
+        PIL images.
+
+        </Tip>
 
         Args:
-            images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                 tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                 number of channels, H and W are image height and width.
 
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
                 If set, will return tensors of a particular framework. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
-                * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
 
         Returns:
-            :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
 
             - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
               width).
diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py
index 5ab22e7bb6..0f5b2db226 100755
--- a/src/transformers/models/perceiver/modeling_perceiver.py
+++ b/src/transformers/models/perceiver/modeling_perceiver.py
@@ -765,83 +765,84 @@ class PerceiverModel(PerceiverPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import PerceiverConfig, PerceiverTokenizer, PerceiverFeatureExtractor, PerceiverModel
-            >>> from transformers.models.perceiver.modeling_perceiver import PerceiverTextPreprocessor, PerceiverImagePreprocessor, PerceiverClassificationDecoder
-            >>> import torch
-            >>> import requests
-            >>> from PIL import Image
+        ```python
+        >>> from transformers import PerceiverConfig, PerceiverTokenizer, PerceiverFeatureExtractor, PerceiverModel
+        >>> from transformers.models.perceiver.modeling_perceiver import PerceiverTextPreprocessor, PerceiverImagePreprocessor, PerceiverClassificationDecoder
+        >>> import torch
+        >>> import requests
+        >>> from PIL import Image
 
-            >>> # EXAMPLE 1: using the Perceiver to classify texts
-            >>> # - we define a TextPreprocessor, which can be used to embed tokens
-            >>> # - we define a ClassificationDecoder, which can be used to decode the
-            >>> # final hidden states of the latents to classification logits
-            >>> # using trainable position embeddings
-            >>> config = PerceiverConfig()
-            >>> preprocessor = PerceiverTextPreprocessor(config)
-            >>> decoder = PerceiverClassificationDecoder(config,
-            ...                                          num_channels=config.d_latents,
-            ...                                          trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
-            ...                                          use_query_residual=True)
-            >>> model = PerceiverModel(config, input_preprocessor=preprocessor, decoder=decoder)
+        >>> # EXAMPLE 1: using the Perceiver to classify texts
+        >>> # - we define a TextPreprocessor, which can be used to embed tokens
+        >>> # - we define a ClassificationDecoder, which can be used to decode the
+        >>> # final hidden states of the latents to classification logits
+        >>> # using trainable position embeddings
+        >>> config = PerceiverConfig()
+        >>> preprocessor = PerceiverTextPreprocessor(config)
+        >>> decoder = PerceiverClassificationDecoder(config,
+        ...                                          num_channels=config.d_latents,
+        ...                                          trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
+        ...                                          use_query_residual=True)
+        >>> model = PerceiverModel(config, input_preprocessor=preprocessor, decoder=decoder)
 
-            >>> # you can then do a forward pass as follows:
-            >>> tokenizer = PerceiverTokenizer()
-            >>> text = "hello world"
-            >>> inputs = tokenizer(text, return_tensors="pt").input_ids
+        >>> # you can then do a forward pass as follows:
+        >>> tokenizer = PerceiverTokenizer()
+        >>> text = "hello world"
+        >>> inputs = tokenizer(text, return_tensors="pt").input_ids
 
-            >>> with torch.no_grad():
-            >>>    outputs = model(inputs=inputs)
-            >>> logits = outputs.logits
+        >>> with torch.no_grad():
+        ...     outputs = model(inputs=inputs)
+        >>> logits = outputs.logits
 
-            >>> # to train, one can train the model using standard cross-entropy:
-            >>> criterion = torch.nn.CrossEntropyLoss()
+        >>> # to train, one can train the model using standard cross-entropy:
+        >>> criterion = torch.nn.CrossEntropyLoss()
 
-            >>> labels = torch.tensor([1])
-            >>> loss = criterion(logits, labels)
+        >>> labels = torch.tensor([1])
+        >>> loss = criterion(logits, labels)
 
-            >>> # EXAMPLE 2: using the Perceiver to classify images
-            >>> # - we define an ImagePreprocessor, which can be used to embed images
-            >>> preprocessor=PerceiverImagePreprocessor(
-            ...              config,
-            ...              prep_type="conv1x1",
-            ...              spatial_downsample=1,
-            ...              out_channels=256,
-            ...              position_encoding_type="trainable",
-            ...              concat_or_add_pos="concat",
-            ...              project_pos_dim=256,
-            ...              trainable_position_encoding_kwargs=dict(num_channels=256, index_dims=config.image_size ** 2,
-            ...              ),
-            ... )
+        >>> # EXAMPLE 2: using the Perceiver to classify images
+        >>> # - we define an ImagePreprocessor, which can be used to embed images
+        >>> preprocessor=PerceiverImagePreprocessor(
+        ...              config,
+        ...              prep_type="conv1x1",
+        ...              spatial_downsample=1,
+        ...              out_channels=256,
+        ...              position_encoding_type="trainable",
+        ...              concat_or_add_pos="concat",
+        ...              project_pos_dim=256,
+        ...              trainable_position_encoding_kwargs=dict(num_channels=256, index_dims=config.image_size ** 2,
+        ...              ),
+        ... )
 
-            >>> model = PerceiverModel(
-            ...         config,
-            ...         input_preprocessor=preprocessor,
-            ...         decoder=PerceiverClassificationDecoder(
-            ...              config,
-            ...              num_channels=config.d_latents,
-            ...              trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
-            ...              use_query_residual=True,
-            ...          ),
-            ... )
+        >>> model = PerceiverModel(
+        ...         config,
+        ...         input_preprocessor=preprocessor,
+        ...         decoder=PerceiverClassificationDecoder(
+        ...              config,
+        ...              num_channels=config.d_latents,
+        ...              trainable_position_encoding_kwargs=dict(num_channels=config.d_latents, index_dims=1),
+        ...              use_query_residual=True,
+        ...          ),
+        ... )
 
-            >>> # you can then do a forward pass as follows:
-            >>> feature_extractor = PerceiverFeatureExtractor()
-            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-            >>> image = Image.open(requests.get(url, stream=True).raw)
-            >>> inputs = feature_extractor(image, return_tensors="pt").pixel_values
+        >>> # you can then do a forward pass as follows:
+        >>> feature_extractor = PerceiverFeatureExtractor()
+        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> inputs = feature_extractor(image, return_tensors="pt").pixel_values
 
-            >>> with torch.no_grad():
-            >>>    outputs = model(inputs=inputs)
-            >>> logits = outputs.logits
+        >>> with torch.no_grad():
+        ...     outputs = model(inputs=inputs)
+        >>> logits = outputs.logits
 
-            >>> # to train, one can train the model using standard cross-entropy:
-            >>> criterion = torch.nn.CrossEntropyLoss()
+        >>> # to train, one can train the model using standard cross-entropy:
+        >>> criterion = torch.nn.CrossEntropyLoss()
 
-            >>> labels = torch.tensor([1])
-            >>> loss = criterion(logits, labels)
-        """
+        >>> labels = torch.tensor([1])
+        >>> loss = criterion(logits, labels)
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/perceiver/tokenization_perceiver.py b/src/transformers/models/perceiver/tokenization_perceiver.py
index 95c4ee6683..505109a6d3 100644
--- a/src/transformers/models/perceiver/tokenization_perceiver.py
+++ b/src/transformers/models/perceiver/tokenization_perceiver.py
@@ -28,26 +28,29 @@ class PerceiverTokenizer(PreTrainedTokenizer):
     """
     Construct a Perceiver tokenizer. The Perceiver simply uses raw bytes utf-8 encoding.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
             The token used for padding, for example when batching sequences of different lengths.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"[BOS]"`):
+        bos_token (`str`, *optional*, defaults to `"[BOS]"`):
             The BOS token (reserved in the vocab, but not actually used).
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"[EOS]"`):
+        eos_token (`str`, *optional*, defaults to `"[EOS]"`):
             The end of sequence token (reserved in the vocab, but not actually used).
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The MASK token, useful for masked language modeling.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The CLS token (reserved in the vocab, but not actually used).
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from two sequences.
 
     """
@@ -115,18 +118,18 @@ class PerceiverTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
@@ -145,17 +148,17 @@ class PerceiverTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks. A sequence has the
         following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
diff --git a/src/transformers/models/phobert/tokenization_phobert.py b/src/transformers/models/phobert/tokenization_phobert.py
index a07e5bba3a..826db73374 100644
--- a/src/transformers/models/phobert/tokenization_phobert.py
+++ b/src/transformers/models/phobert/tokenization_phobert.py
@@ -69,41 +69,47 @@ class PhobertTokenizer(PreTrainedTokenizer):
     """
     Construct a PhoBERT tokenizer. Based on Byte-Pair-Encoding.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Path to the merges file.
-        bos_token (:obj:`st`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
     """
@@ -162,17 +168,17 @@ class PhobertTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A PhoBERT sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         if token_ids_1 is None:
@@ -186,18 +192,18 @@ class PhobertTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -217,13 +223,13 @@ class PhobertTokenizer(PreTrainedTokenizer):
         make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
         """
 
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/prophetnet/configuration_prophetnet.py b/src/transformers/models/prophetnet/configuration_prophetnet.py
index 074bad3e24..2ed8571e4b 100644
--- a/src/transformers/models/prophetnet/configuration_prophetnet.py
+++ b/src/transformers/models/prophetnet/configuration_prophetnet.py
@@ -28,69 +28,69 @@ PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class ProphetNetConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.ProphetNetModel`. It is used
+    This is the configuration class to store the configuration of a [`ProphetNetModel`]. It is used
     to instantiate a ProphetNet model according to the specified arguments, defining the model architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        activation_dropout (`float`, *optional*, defaults to 0.1):
             The dropout ratio for activations inside the fully connected layer.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the ProphetNET model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.ProphetNetModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 1024):
+            the `input_ids` passed when calling [`ProphetNetModel`].
+        hidden_size (`int`, *optional*, defaults to 1024):
             Dimensionality of the layers and the pooler layer.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        num_encoder_layers (:obj:`int`, `optional`, defaults to 12):
+        num_encoder_layers (`int`, *optional*, defaults to 12):
             Number of encoder layers.
-        num_encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        num_encoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
-            Dimensionality of the ``intermediate`` (often named feed-forward) layer in decoder.
-        num_decoder_layers (:obj:`int`, `optional`, defaults to 12):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the `intermediate` (often named feed-forward) layer in decoder.
+        num_decoder_layers (`int`, *optional*, defaults to 12):
             Number of decoder layers.
-        num_decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        num_decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        attention_dropout (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        add_cross_attention (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        add_cross_attention (`bool`, *optional*, defaults to `True`):
             Whether cross-attention layers should be added to the model.
-        is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
             Whether this is an encoder/decoder model.
-        pad_token_id (:obj:`int`, `optional`, defaults to 1)
+        pad_token_id (`int`, *optional*, defaults to 1):
             Padding token id.
-        bos_token_id (:obj:`int`, `optional`, defaults to 0)
+        bos_token_id (`int`, *optional*, defaults to 0):
             Beginning of stream token id.
-        eos_token_id (:obj:`int`, `optional`, defaults to 2)
+        eos_token_id (`int`, *optional*, defaults to 2):
             End of stream token id.
-        ngram (:obj:`int`, `optional`, defaults to 2)
+        ngram (`int`, *optional*, defaults to 2):
             Number of future tokens to predict. Set to 1 to be same as traditional Language model to predict next first
             token.
-        num_buckets (:obj:`int`, `optional`, defaults to 32)
+        num_buckets (`int`, *optional*, defaults to 32):
             The number of buckets to use for each attention layer. This is for relative position calculation. See the
-            `T5 paper <see https://arxiv.org/abs/1910.10683>`__ for more details.
-        relative_max_distance (:obj:`int`, `optional`, defaults to 128)
+            [T5 paper](https://arxiv.org/abs/1910.10683) for more details.
+        relative_max_distance (`int`, *optional*, defaults to 128):
             Relative distances greater than this number will be put into the last same bucket. This is for relative
-            position calculation. See the `T5 paper <see https://arxiv.org/abs/1910.10683>`__ for more details.
-        disable_ngram_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            position calculation. See the [T5 paper](https://arxiv.org/abs/1910.10683) for more details.
+        disable_ngram_loss (`bool`, *optional*, defaults to `False`):
             Whether be trained predicting only the next first token.
-        eps (:obj:`float`, `optional`, defaults to 0.0):
-            Controls the ``epsilon`` parameter value for label smoothing in the loss calculation. If set to 0, no label
+        eps (`float`, *optional*, defaults to 0.0):
+            Controls the `epsilon` parameter value for label smoothing in the loss calculation. If set to 0, no label
             smoothing is performed.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
     """
     model_type = "prophetnet"
diff --git a/src/transformers/models/prophetnet/modeling_prophetnet.py b/src/transformers/models/prophetnet/modeling_prophetnet.py
index 34f8f1904f..9af663401d 100644
--- a/src/transformers/models/prophetnet/modeling_prophetnet.py
+++ b/src/transformers/models/prophetnet/modeling_prophetnet.py
@@ -1271,18 +1271,19 @@ class ProphetNetEncoder(ProphetNetPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import ProphetNetTokenizer, ProphetNetEncoder
-            >>> import torch
+        ```python
+        >>> from transformers import ProphetNetTokenizer, ProphetNetEncoder
+        >>> import torch
 
-            >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
-            >>> model = ProphetNetEncoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone')
-            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-            >>> outputs = model(**inputs)
+        >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
+        >>> model = ProphetNetEncoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone')
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
 
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1788,20 +1789,21 @@ class ProphetNetModel(ProphetNetPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import ProphetNetTokenizer, ProphetNetModel
+        ```python
+        >>> from transformers import ProphetNetTokenizer, ProphetNetModel
 
-            >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
-            >>> model = ProphetNetModel.from_pretrained('microsoft/prophetnet-large-uncased')
+        >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
+        >>> model = ProphetNetModel.from_pretrained('microsoft/prophetnet-large-uncased')
 
-            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
-            >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
-            >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
 
-            >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
-            >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
-        """
+        >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
+        >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
+        ```"""
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
diff --git a/src/transformers/models/prophetnet/tokenization_prophetnet.py b/src/transformers/models/prophetnet/tokenization_prophetnet.py
index 696420ca87..9866b7fd38 100644
--- a/src/transformers/models/prophetnet/tokenization_prophetnet.py
+++ b/src/transformers/models/prophetnet/tokenization_prophetnet.py
@@ -56,46 +56,45 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
     r"""
     Construct a ProphetNetTokenizer. Based on WordPiece.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
             Whether or not to do basic tokenization before WordPiece.
-        never_split (:obj:`Iterable`, `optional`):
+        never_split (`Iterable`, *optional*):
             Collection of tokens which will never be split during tokenization. Only has an effect when
-            :obj:`do_basic_tokenize=True`
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+            `do_basic_tokenize=True`
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        x_sep_token (:obj:`str`, `optional`, defaults to :obj:`"[X_SEP]"`):
+        x_sep_token (`str`, *optional*, defaults to `"[X_SEP]"`):
             Special second separator token, which can be generated by
-            :class:`~transformers.ProphetNetForConditionalGeneration`. It is used to separate bullet-point like
+            [`ProphetNetForConditionalGeneration`]. It is used to separate bullet-point like
             sentences in summarization, *e.g.*.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
             Whether or not to tokenize Chinese characters.
 
-            This should likely be deactivated for Japanese (see this `issue
-            <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+            This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
+            value for `lowercase` (as in the original BERT).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -189,18 +188,18 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
@@ -218,21 +217,21 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A ProphetNet
         sequence pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
@@ -267,17 +266,17 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A BERT sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return token_ids_0 + [self.sep_token_id]
diff --git a/src/transformers/models/qdqbert/configuration_qdqbert.py b/src/transformers/models/qdqbert/configuration_qdqbert.py
index ede907486d..a30353a3b5 100644
--- a/src/transformers/models/qdqbert/configuration_qdqbert.py
+++ b/src/transformers/models/qdqbert/configuration_qdqbert.py
@@ -28,60 +28,60 @@ QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class QDQBertConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.QDQBertModel`. It is used to
+    This is the configuration class to store the configuration of a [`QDQBertModel`]. It is used to
     instantiate an QDQBERT model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the BERT `bert-base-uncased
-    <https://huggingface.co/bert-base-uncased>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the BERT [bert-base-uncased](https://huggingface.co/bert-base-uncased) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the QDQBERT model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.QDQBertModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `input_ids` passed when calling [`QDQBertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimension of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.QDQBertModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`QDQBertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if ``config.is_decoder=True``.
+            relevant if `config.is_decoder=True`.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import QDQBertModel, QDQBertConfig
+    ```python
+    >>> from transformers import QDQBertModel, QDQBertConfig
 
-        >>> # Initializing a QDQBERT bert-base-uncased style configuration
-        >>> configuration = QDQBertConfig()
+    >>> # Initializing a QDQBERT bert-base-uncased style configuration
+    >>> configuration = QDQBertConfig()
 
-        >>> # Initializing a model from the bert-base-uncased style configuration
-        >>> model = QDQBertModel(configuration)
+    >>> # Initializing a model from the bert-base-uncased style configuration
+    >>> model = QDQBertModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "qdqbert"
 
     def __init__(
diff --git a/src/transformers/models/rag/configuration_rag.py b/src/transformers/models/rag/configuration_rag.py
index 252d91660e..978b01cdd2 100644
--- a/src/transformers/models/rag/configuration_rag.py
+++ b/src/transformers/models/rag/configuration_rag.py
@@ -21,62 +21,62 @@ from ...file_utils import add_start_docstrings
 
 
 RAG_CONFIG_DOC = r"""
-    :class:`~transformers.RagConfig` stores the configuration of a `RagModel`. Configuration objects inherit from
-    :class:`~transformers.PretrainedConfig` and can be used to control the model outputs. Read the documentation from
-    :class:`~transformers.PretrainedConfig` for more information.
+    [`RagConfig`] stores the configuration of a [`RagModel`]. Configuration objects inherit from
+    [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from
+    [`PretrainedConfig`] for more information.
 
     Args:
-        title_sep (:obj:`str`, `optional`, defaults to  ``" / "``):
+        title_sep (`str`, *optional*, defaults to  `" / "`):
             Separator inserted between the title and the text of the retrieved document when calling
-            :class:`~transformers.RagRetriever`.
-        doc_sep (:obj:`str`, `optional`, defaults to  ``" // "``):
+            [`RagRetriever`].
+        doc_sep (`str`, *optional*, defaults to  `" // "`):
             Separator inserted between the the text of the retrieved document and the original input when calling
-            :class:`~transformers.RagRetriever`.
-        n_docs (:obj:`int`, `optional`, defaults to 5):
+            [`RagRetriever`].
+        n_docs (`int`, *optional*, defaults to 5):
             Number of documents to retrieve.
-        max_combined_length (:obj:`int`, `optional`, defaults to 300):
-            Max length of contextualized input returned by :meth:`~transformers.RagRetriever.__call__`.
-        retrieval_vector_size (:obj:`int`, `optional`, defaults to 768):
-            Dimensionality of the document embeddings indexed by :class:`~transformers.RagRetriever`.
-        retrieval_batch_size (:obj:`int`, `optional`, defaults to 8):
+        max_combined_length (`int`, *optional*, defaults to 300):
+            Max length of contextualized input returned by [`~RagRetriever.__call__`].
+        retrieval_vector_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the document embeddings indexed by [`RagRetriever`].
+        retrieval_batch_size (`int`, *optional*, defaults to 8):
             Retrieval batch size, defined as the number of queries issues concurrently to the faiss index encapsulated
-            :class:`~transformers.RagRetriever`.
-        dataset (:obj:`str`, `optional`, defaults to :obj:`"wiki_dpr"`):
+            [`RagRetriever`].
+        dataset (`str`, *optional*, defaults to `"wiki_dpr"`):
             A dataset identifier of the indexed dataset in HuggingFace Datasets (list all available datasets and ids
-            using :obj:`datasets.list_datasets()`).
-        dataset_split (:obj:`str`, `optional`, defaults to :obj:`"train"`)
-            Which split of the :obj:`dataset` to load.
-        index_name (:obj:`str`, `optional`, defaults to :obj:`"compressed"`)
-            The index name of the index associated with the :obj:`dataset`. One can choose between :obj:`"legacy"`,
-            :obj:`"exact"` and :obj:`"compressed"`.
-        index_path (:obj:`str`, `optional`)
+            using `datasets.list_datasets()`).
+        dataset_split (`str`, *optional*, defaults to `"train"`):
+            Which split of the `dataset` to load.
+        index_name (`str`, *optional*, defaults to `"compressed"`):
+            The index name of the index associated with the `dataset`. One can choose between `"legacy"`,
+            `"exact"` and `"compressed"`.
+        index_path (`str`, *optional*):
             The path to the serialized faiss index on disk.
-        passages_path: (:obj:`str`, `optional`):
+        passages_path (`str`, *optional*):
             A path to text passages compatible with the faiss index. Required if using
-            :class:`~transformers.models.rag.retrieval_rag.LegacyIndex`
-        use_dummy_dataset (:obj:`bool`, `optional`, defaults to ``False``)
-            Whether to load a "dummy" variant of the dataset specified by :obj:`dataset`.
-        label_smoothing (:obj:`float`, `optional`, defaults to 0.0):
-            Only relevant if ``return_loss`` is set to :obj:`True`. Controls the ``epsilon`` parameter value for label
+            [`~models.rag.retrieval_rag.LegacyIndex`]
+        use_dummy_dataset (`bool`, *optional*, defaults to `False`):
+            Whether to load a "dummy" variant of the dataset specified by `dataset`.
+        label_smoothing (`float`, *optional*, defaults to 0.0):
+            Only relevant if `return_loss` is set to `True`. Controls the `epsilon` parameter value for label
             smoothing in the loss calculation. If set to 0, no label smoothing is performed.
-        do_marginalize (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            If :obj:`True`, the logits are marginalized over all documents by making use of
-            ``torch.nn.functional.log_softmax``.
-        reduce_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to reduce the NLL loss using the ``torch.Tensor.sum`` operation.
-        do_deduplication (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_marginalize (`bool`, *optional*, defaults to `False`):
+            If `True`, the logits are marginalized over all documents by making use of
+            `torch.nn.functional.log_softmax`.
+        reduce_loss (`bool`, *optional*, defaults to `False`):
+            Whether or not to reduce the NLL loss using the `torch.Tensor.sum` operation.
+        do_deduplication (`bool`, *optional*, defaults to `True`):
             Whether or not to deduplicate the generations from different context documents for a given input. Has to be
-            set to :obj:`False` if used while training with distributed backend.
-        exclude_bos_score (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            set to `False` if used while training with distributed backend.
+        exclude_bos_score (`bool`, *optional*, defaults to `False`):
             Whether or not to disregard the BOS token when computing the loss.
-        output_retrieved(:obj:`bool`, `optional`, defaults to :obj:`False`):
-            If set to ``True``, :obj:`retrieved_doc_embeds`, :obj:`retrieved_doc_ids`, :obj:`context_input_ids` and
-            :obj:`context_attention_mask` are returned. See returned tensors for more detail.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        output_retrieved (`bool`, *optional*, defaults to `False`):
+            If set to `True`, `retrieved_doc_embeds`, `retrieved_doc_ids`, `context_input_ids` and
+            `context_attention_mask` are returned. See returned tensors for more detail.
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
-        forced_eos_token_id (:obj:`int`, `optional`):
-            The id of the token to force as the last generated token when :obj:`max_length` is reached. Usually set to
-            :obj:`eos_token_id`.
+        forced_eos_token_id (`int`, *optional*):
+            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+            `eos_token_id`.
 """
 
 
@@ -174,21 +174,21 @@ class RagConfig(PretrainedConfig):
         cls, question_encoder_config: PretrainedConfig, generator_config: PretrainedConfig, **kwargs
     ) -> PretrainedConfig:
         r"""
-        Instantiate a :class:`~transformers.EncoderDecoderConfig` (or a derived class) from a pre-trained encoder model
+        Instantiate an [`EncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model
         configuration and decoder model configuration.
 
         Returns:
-            :class:`EncoderDecoderConfig`: An instance of a configuration object
+            [`EncoderDecoderConfig`]: An instance of a configuration object
         """
         return cls(question_encoder=question_encoder_config.to_dict(), generator=generator_config.to_dict(), **kwargs)
 
     def to_dict(self):
         """
         Serializes this instance to a Python dictionary. Override the default
-        :meth:`~transformers.PretrainedConfig.to_dict`.
+        [`~PretrainedConfig.to_dict`].
 
         Returns:
-            :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
         """
         output = copy.deepcopy(self.__dict__)
         output["question_encoder"] = self.question_encoder.to_dict()
diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py
index a3b120d09f..b10270d77d 100644
--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -544,19 +544,20 @@ class RagModel(RagPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import RagTokenizer, RagRetriever, RagModel
-            >>> import torch
+        ```python
+        >>> from transformers import RagTokenizer, RagRetriever, RagModel
+        >>> import torch
 
-            >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
-            >>> retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
-            >>> # initialize with RagRetriever to do everything in one forward call
-            >>> model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever)
+        >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
+        >>> retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
+        >>> # initialize with RagRetriever to do everything in one forward call
+        >>> model = RagModel.from_pretrained("facebook/rag-token-base", retriever=retriever)
 
-            >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
-            >>> outputs = model(input_ids=inputs["input_ids"])
-        """
+        >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
+        >>> outputs = model(input_ids=inputs["input_ids"])
+        ```"""
         n_docs = n_docs if n_docs is not None else self.config.n_docs
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py
index ccd1524bb6..cf41f74ea5 100644
--- a/src/transformers/models/rag/modeling_tf_rag.py
+++ b/src/transformers/models/rag/modeling_tf_rag.py
@@ -540,21 +540,21 @@ class TFRagModel(TFRagPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import RagTokenizer, RagRetriever, RagModel
-            >>> import torch
+        ```python
+        >>> from transformers import RagTokenizer, RagRetriever, TFRagModel
+        >>> import torch
 
-            >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
-            >>> retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
-            >>> # initialize with RagRetriever to do everything in one forward call
-            >>> model = TFRagModel.from_pretrained("facebook/rag-token-base", retriever=retriever, from_pt=True)
+        >>> tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")
+        >>> retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)
+        >>> # initialize with RagRetriever to do everything in one forward call
+        >>> model = TFRagModel.from_pretrained("facebook/rag-token-base", retriever=retriever, from_pt=True)
 
-            >>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf")
-            >>> input_ids = input_dict["input_ids"]
-            >>> outputs = model(input_ids)
-
-        """
+        >>> input_dict = tokenizer.prepare_seq2seq_batch("How many people live in Paris?", "In Paris, there are 10 million people.", return_tensors="tf")
+        >>> input_ids = input_dict["input_ids"]
+        >>> outputs = model(input_ids)
+        ```"""
         assert (
             "decoder_cached_states" not in kwargs
         ), "Please use past_key_values to cache intermediate outputs"  # from modeling_tf_bart.py
diff --git a/src/transformers/models/rag/retrieval_rag.py b/src/transformers/models/rag/retrieval_rag.py
index aaf8fcdf77..2c2820b084 100644
--- a/src/transformers/models/rag/retrieval_rag.py
+++ b/src/transformers/models/rag/retrieval_rag.py
@@ -44,7 +44,7 @@ LEGACY_INDEX_PATH = "https://storage.googleapis.com/huggingface-nlp/datasets/wik
 
 class Index:
     """
-    A base class for the Indices encapsulated by the :class:`~transformers.RagRetriever`.
+    A base class for the Indices encapsulated by the [`RagRetriever`].
     """
 
     def get_doc_dicts(self, doc_ids: np.ndarray) -> List[dict]:
@@ -52,31 +52,31 @@ class Index:
         Returns a list of dictionaries, containing titles and text of the retrieved documents.
 
         Args:
-            doc_ids (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`):
+            doc_ids (`np.ndarray` of shape `(batch_size, n_docs)`):
                 A tensor of document indices.
         """
         raise NotImplementedError
 
     def get_top_docs(self, question_hidden_states: np.ndarray, n_docs=5) -> Tuple[np.ndarray, np.ndarray]:
         """
-        For each query in the batch, retrieves ``n_docs`` documents.
+        For each query in the batch, retrieves `n_docs` documents.
 
         Args:
-            question_hidden_states (:obj:`np.ndarray` of shape :obj:`(batch_size, vector_size):
+            question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
                 An array of query vectors.
-            n_docs (:obj:`int`):
+            n_docs (`int`):
                 The number of docs retrieved per query.
 
         Returns:
-            :obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`: A tensor of indices of retrieved documents.
-            :obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`: A tensor of vector representations of
+            `np.ndarray` of shape `(batch_size, n_docs)`: A tensor of indices of retrieved documents.
+            `np.ndarray` of shape `(batch_size, vector_size)`: A tensor of vector representations of
             retrieved documents.
         """
         raise NotImplementedError
 
     def is_initialized(self):
         """
-        Returns :obj:`True` if index is already initialized.
+        Returns `True` if index is already initialized.
         """
         raise NotImplementedError
 
@@ -95,11 +95,11 @@ class LegacyIndex(Index):
     default faiss index parameters as specified in that repository.
 
     Args:
-        vector_size (:obj:`int`):
+        vector_size (`int`):
             The dimension of indexed vectors.
-        index_path (:obj:`str`):
-            A path to a `directory` containing index files compatible with
-            :class:`~transformers.models.rag.retrieval_rag.LegacyIndex`
+        index_path (`str`):
+            A path to a *directory* containing index files compatible with
+            [`~models.rag.retrieval_rag.LegacyIndex`]
     """
 
     INDEX_FILENAME = "hf_bert_base.hnswSQ8_correct_phi_128.c_index"
@@ -114,7 +114,7 @@ class LegacyIndex(Index):
         self._index_initialized = False
 
     def _resolve_path(self, index_path, filename):
-        assert os.path.isdir(index_path) or is_remote_url(index_path), "Please specify a valid ``index_path``."
+        assert os.path.isdir(index_path) or is_remote_url(index_path), "Please specify a valid `index_path`."
         archive_file = os.path.join(index_path, filename)
         try:
             # Load from URL or cache if already cached
@@ -228,23 +228,23 @@ class HFIndexBase(Index):
 
 class CanonicalHFIndex(HFIndexBase):
     """
-    A wrapper around an instance of :class:`~datasets.Datasets`. If ``index_path`` is set to ``None``, we load the
-    pre-computed index available with the :class:`~datasets.arrow_dataset.Dataset`, otherwise, we load the index from
+    A wrapper around an instance of [`~datasets.Datasets`]. If `index_path` is set to `None`, we load the
+    pre-computed index available with the [`~datasets.arrow_dataset.Dataset`], otherwise, we load the index from
     the indicated path on disk.
 
     Args:
-        vector_size (:obj:`int`): the dimension of the passages embeddings used by the index
-        dataset_name (:obj:`str`, optional, defaults to ``wiki_dpr``):
+        vector_size (`int`): the dimension of the passages embeddings used by the index
+        dataset_name (`str`, *optional*, defaults to `wiki_dpr`):
             A dataset identifier of the indexed dataset on HuggingFace AWS bucket (list all available datasets and ids
-            with ``datasets.list_datasets()``).
-        dataset_split (:obj:`str`, optional, defaults to ``train``)
-            Which split of the ``dataset`` to load.
-        index_name (:obj:`str`, optional, defaults to ``train``)
-            The index_name of the index associated with the ``dataset``. The index loaded from ``index_path`` will be
+            with `datasets.list_datasets()`).
+        dataset_split (`str`, *optional*, defaults to `train`):
+            Which split of the `dataset` to load.
+        index_name (`str`, *optional*, defaults to `train`):
+            The index_name of the index associated with the `dataset`. The index loaded from `index_path` will be
             saved under this name.
-        index_path (:obj:`str`, optional, defaults to ``None``)
+        index_path (`str`, *optional*, defaults to `None`):
             The path to the serialized faiss index on disk.
-        use_dummy_dataset (:obj:`bool`, optional, defaults to ``False``): If True, use the dummy configuration of the dataset for tests.
+        use_dummy_dataset (`bool`, optional, defaults to `False`): If True, use the dummy configuration of the dataset for tests.
     """
 
     def __init__(
@@ -289,15 +289,15 @@ class CanonicalHFIndex(HFIndexBase):
 
 class CustomHFIndex(HFIndexBase):
     """
-    A wrapper around an instance of :class:`~datasets.Datasets`. The dataset and the index are both loaded from the
+    A wrapper around an instance of [`~datasets.Datasets`]. The dataset and the index are both loaded from the
     indicated paths on disk.
 
     Args:
-        vector_size (:obj:`int`): the dimension of the passages embeddings used by the index
-        dataset_path (:obj:`str`):
+        vector_size (`int`): the dimension of the passages embeddings used by the index
+        dataset_path (`str`):
             The path to the serialized dataset on disk. The dataset should have 3 columns: title (str), text (str) and
             embeddings (arrays of dimension vector_size)
-        index_path (:obj:`str`)
+        index_path (`str`):
             The path to the serialized faiss index on disk.
     """
 
@@ -310,8 +310,8 @@ class CustomHFIndex(HFIndexBase):
         logger.info(f"Loading passages from {dataset_path}")
         if dataset_path is None or index_path is None:
             raise ValueError(
-                "Please provide ``dataset_path`` and ``index_path`` after calling ``dataset.save_to_disk(dataset_path)`` "
-                "and ``dataset.get_index('embeddings').save(index_path)``."
+                "Please provide `dataset_path` and `index_path` after calling `dataset.save_to_disk(dataset_path)` "
+                "and `dataset.get_index('embeddings').save(index_path)`."
             )
         dataset = load_from_disk(dataset_path)
         return cls(vector_size=vector_size, dataset=dataset, index_path=index_path)
@@ -329,40 +329,40 @@ class RagRetriever:
     contents, and it formats them to be used with a RagModel.
 
     Args:
-        config (:class:`~transformers.RagConfig`):
+        config ([`RagConfig`]):
             The configuration of the RAG model this Retriever is used with. Contains parameters indicating which
-            ``Index`` to build. You can load your own custom dataset with ``config.index_name="custom"`` or use a
-            canonical one (default) from the datasets library with ``config.index_name="wiki_dpr"`` for example.
-        question_encoder_tokenizer (:class:`~transformers.PreTrainedTokenizer`):
+            `Index` to build. You can load your own custom dataset with `config.index_name="custom"` or use a
+            canonical one (default) from the datasets library with `config.index_name="wiki_dpr"` for example.
+        question_encoder_tokenizer ([`PreTrainedTokenizer`]):
             The tokenizer that was used to tokenize the question. It is used to decode the question and then use the
             generator_tokenizer.
-        generator_tokenizer (:class:`~transformers.PreTrainedTokenizer`):
+        generator_tokenizer ([`PreTrainedTokenizer`]):
             The tokenizer used for the generator part of the RagModel.
-        index (:class:`~transformers.models.rag.retrieval_rag.Index`, optional, defaults to the one defined by the configuration):
+        index ([`~models.rag.retrieval_rag.Index`], optional, defaults to the one defined by the configuration):
             If specified, use this index instead of the one built using the configuration
 
-    Examples::
+    Examples:
 
-        >>> # To load the default "wiki_dpr" dataset with 21M passages from wikipedia (index name is 'compressed' or 'exact')
-        >>> from transformers import RagRetriever
-        >>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', dataset="wiki_dpr", index_name='compressed')
+    ```python
+    >>> # To load the default "wiki_dpr" dataset with 21M passages from wikipedia (index name is 'compressed' or 'exact')
+    >>> from transformers import RagRetriever
+    >>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', dataset="wiki_dpr", index_name='compressed')
 
-        >>> # To load your own indexed dataset built with the datasets library. More info on how to build the indexed dataset in examples/rag/use_own_knowledge_dataset.py
-        >>> from transformers import RagRetriever
-        >>> dataset = ...  # dataset must be a datasets.Datasets object with columns "title", "text" and "embeddings", and it must have a faiss index
-        >>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', indexed_dataset=dataset)
+    >>> # To load your own indexed dataset built with the datasets library. More info on how to build the indexed dataset in examples/rag/use_own_knowledge_dataset.py
+    >>> from transformers import RagRetriever
+    >>> dataset = ...  # dataset must be a datasets.Datasets object with columns "title", "text" and "embeddings", and it must have a faiss index
+    >>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', indexed_dataset=dataset)
 
-        >>> # To load your own indexed dataset built with the datasets library that was saved on disk. More info in examples/rag/use_own_knowledge_dataset.py
-        >>> from transformers import RagRetriever
-        >>> dataset_path = "path/to/my/dataset"  # dataset saved via `dataset.save_to_disk(...)`
-        >>> index_path = "path/to/my/index.faiss"  # faiss index saved via `dataset.get_index("embeddings").save(...)`
-        >>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', index_name='custom', passages_path=dataset_path, index_path=index_path)
+    >>> # To load your own indexed dataset built with the datasets library that was saved on disk. More info in examples/rag/use_own_knowledge_dataset.py
+    >>> from transformers import RagRetriever
+    >>> dataset_path = "path/to/my/dataset"  # dataset saved via `dataset.save_to_disk(...)`
+    >>> index_path = "path/to/my/index.faiss"  # faiss index saved via `dataset.get_index("embeddings").save(...)`
+    >>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', index_name='custom', passages_path=dataset_path, index_path=index_path)
 
-        >>> # To load the legacy index built originally for Rag's paper
-        >>> from transformers import RagRetriever
-        >>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', index_name='legacy')
-
-    """
+    >>> # To load the legacy index built originally for Rag's paper
+    >>> from transformers import RagRetriever
+    >>> retriever = RagRetriever.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base', index_name='legacy')
+    ```"""
 
     def __init__(self, config, question_encoder_tokenizer, generator_tokenizer, index=None, init_retrieval=True):
         self._init_retrieval = init_retrieval
@@ -454,19 +454,19 @@ class RagRetriever:
 
     def postprocess_docs(self, docs, input_strings, prefix, n_docs, return_tensors=None):
         r"""
-        Postprocessing retrieved ``docs`` and combining them with ``input_strings``.
+        Postprocessing retrieved `docs` and combining them with `input_strings`.
 
         Args:
-            docs  (:obj:`dict`):
+            docs (`dict`):
                 Retrieved documents.
-            input_strings (:obj:`str`):
-                Input strings decoded by ``preprocess_query``.
-            prefix (:obj:`str`):
+            input_strings (`str`):
+                Input strings decoded by `preprocess_query`.
+            prefix (`str`):
                 Prefix added at the beginning of each input, typically used with T5-based models.
 
         Return:
-            :obj:`tuple(tensors)`: a tuple consisting of two elements: contextualized ``input_ids`` and a compatible
-            ``attention_mask``.
+            `tuple(tensors)`: a tuple consisting of two elements: contextualized `input_ids` and a compatible
+            `attention_mask`.
         """
 
         def cat_input_and_doc(doc_title, doc_text, input_string, prefix):
@@ -526,22 +526,22 @@ class RagRetriever:
 
     def retrieve(self, question_hidden_states: np.ndarray, n_docs: int) -> Tuple[np.ndarray, List[dict]]:
         """
-        Retrieves documents for specified ``question_hidden_states``.
+        Retrieves documents for specified `question_hidden_states`.
 
         Args:
-            question_hidden_states (:obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`):
+            question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
                 A batch of query vectors to retrieve with.
-            n_docs (:obj:`int`):
+            n_docs (`int`):
                 The number of docs retrieved per query.
 
         Return:
-            :obj:`Tuple[np.ndarray, np.ndarray, List[dict]]`: A tuple with the following objects:
+            `Tuple[np.ndarray, np.ndarray, List[dict]]`: A tuple with the following objects:
 
-            - **retrieved_doc_embeds** (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs, dim)`) -- The retrieval
+            - **retrieved_doc_embeds** (`np.ndarray` of shape `(batch_size, n_docs, dim)`) -- The retrieval
               embeddings of the retrieved docs per query.
-            - **doc_ids** (:obj:`np.ndarray` of shape :obj:`(batch_size, n_docs)`) -- The ids of the documents in the
+            - **doc_ids** (`np.ndarray` of shape `(batch_size, n_docs)`) -- The ids of the documents in the
               index
-            - **doc_dicts** (:obj:`List[dict]`): The :obj:`retrieved_doc_embeds` examples per query.
+            - **doc_dicts** (`List[dict]`) -- The `retrieved_doc_embeds` examples per query.
         """
 
         doc_ids, retrieved_doc_embeds = self._main_retrieve(question_hidden_states, n_docs)
@@ -561,34 +561,34 @@ class RagRetriever:
         return_tensors=None,
     ) -> BatchEncoding:
         """
-        Retrieves documents for specified :obj:`question_hidden_states`.
+        Retrieves documents for specified `question_hidden_states`.
 
         Args:
-            question_input_ids: (:obj:`List[List[int]]`) batch of input ids
-            question_hidden_states (:obj:`np.ndarray` of shape :obj:`(batch_size, vector_size)`:
+            question_input_ids (`List[List[int]]`): Batch of input ids.
+            question_hidden_states (`np.ndarray` of shape `(batch_size, vector_size)`):
                 A batch of query vectors to retrieve with.
-            prefix: (:obj:`str`, `optional`):
+            prefix (`str`, *optional*):
                 The prefix used by the generator's tokenizer.
-            n_docs (:obj:`int`, `optional`):
+            n_docs (`int`, *optional*):
                 The number of docs retrieved per query.
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to "pt"):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to "pt"):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
 
-        Returns: :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following
+        Returns: [`BatchEncoding`]: A [`BatchEncoding`] with the following
         fields:
 
             - **context_input_ids** -- List of token ids to be fed to a model.
 
-              `What are input IDs? <../glossary.html#input-ids>`__
+              [What are input IDs?](../glossary#input-ids)
 
             - **context_attention_mask** -- List of indices specifying which tokens should be attended to by the model
-            (when :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
+            (when `return_attention_mask=True` or if `"attention_mask"` is in `self.model_input_names`).
 
-              `What are attention masks? <../glossary.html#attention-mask>`__
+              [What are attention masks?](../glossary#attention-mask)
 
             - **retrieved_doc_embeds** -- List of embeddings of the retrieved documents
             - **doc_ids** -- List of ids of the retrieved documents
diff --git a/src/transformers/models/reformer/configuration_reformer.py b/src/transformers/models/reformer/configuration_reformer.py
index 406163a7ff..aaad9e96d6 100755
--- a/src/transformers/models/reformer/configuration_reformer.py
+++ b/src/transformers/models/reformer/configuration_reformer.py
@@ -29,133 +29,131 @@ REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class ReformerConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.ReformerModel`. It is used to
+    This is the configuration class to store the configuration of a [`ReformerModel`]. It is used to
     instantiate a Reformer model according to the specified arguments, defining the model architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        attention_head_size (:obj:`int`, `optional`, defaults to 64):
+        attention_head_size (`int`, *optional*, defaults to 64):
             Dimensionality of the projected key, query and value vectors
-        attn_layers (:obj:`List[str]`, `optional`, defaults to :obj:`["local", "lsh", "local", "lsh", "local", "lsh"]`):
+        attn_layers (`List[str]`, *optional*, defaults to `["local", "lsh", "local", "lsh", "local", "lsh"]`):
             List of attention layer types in ascending order. It can be chosen between a LSHSelfAttention layer
-            (:obj:`"lsh"`) and a LocalSelfAttention layer (:obj:`"local"`).
+            (`"lsh"`) and a LocalSelfAttention layer (`"local"`).
 
-            For more information on LSHSelfAttention layer, see `LSH Self Attention
-            <reformer.html#lsh-self-attention>`__. For more information on LocalSelfAttention layer, see `Local Self
-            Attention <reformer.html#local-self-attention>`__.
-        axial_pos_embds (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            For more information on LSHSelfAttention layer, see [LSH Self Attention](reformer#lsh-self-attention). For more information on LocalSelfAttention layer, see [Local Self
+            Attention](reformer#local-self-attention).
+        axial_pos_embds (`bool`, *optional*, defaults to `True`):
             Whether or not to use axial position embeddings. For more information on how axial position embeddings
-            work, see `Axial Position Encodings <reformer.html#axial-positional-encodings>`__.
-        axial_norm_std (:obj:`float`, `optional`, defaults to 1.0):
+            work, see [Axial Position Encodings](reformer#axial-positional-encodings).
+        axial_norm_std (`float`, *optional*, defaults to 1.0):
             The standard deviation of the normal_initializer for initializing the weight matrices of the axial
             positional encodings.
-        axial_pos_shape (:obj:`List[int]`, `optional`, defaults to :obj:`[64, 64]`):
+        axial_pos_shape (`List[int]`, *optional*, defaults to `[64, 64]`):
             The position dims of the axial position encodings. During training, the product of the position dims has to
             be equal to the sequence length.
 
-            For more information on how axial position embeddings work, see `Axial Position Encodings
-            <reformer.html#axial-positional-encodings>`__.
-        axial_pos_embds_dim (:obj:`List[int]`, `optional`, defaults to :obj:`[64, 192]`):
+            For more information on how axial position embeddings work, see [Axial Position Encodings](reformer#axial-positional-encodings).
+        axial_pos_embds_dim (`List[int]`, *optional*, defaults to `[64, 192]`):
             The embedding dims of the axial position encodings. The sum of the embedding dims has to be equal to the
             hidden size.
 
-            For more information on how axial position embeddings work, see `Axial Position Encodings
-            <reformer.html#axial-positional-encodings>`__.
-        chunk_size_lm_head (:obj:`int`, `optional`, defaults to 0):
+            For more information on how axial position embeddings work, see [Axial Position Encodings](reformer#axial-positional-encodings).
+        chunk_size_lm_head (`int`, *optional*, defaults to 0):
             The chunk size of the final language model feed forward head layer. A chunk size of 0 means that the feed
             forward layer is not chunked. A chunk size of n means that the feed forward layer processes n <
             sequence_length embeddings at a time.
 
-            For more information on feed forward chunking, see `How does Feed Forward Chunking work?
-            <../glossary.html#feed-forward-chunking>`__.
-        eos_token_id (:obj:`int`, `optional`, defaults to 2):
+            For more information on feed forward chunking, see [How does Feed Forward Chunking work?](../glossary#feed-forward-chunking).
+        eos_token_id (`int`, *optional*, defaults to 2):
             The token id for the end-of-sentence token.
-        feed_forward_size (:obj:`int`, `optional`, defaults to 512):
+        feed_forward_size (`int`, *optional*, defaults to 512):
             Dimensionality of the feed_forward layer in the residual attention block.
-        hash_seed (:obj:`int`, `optional`):
-            Seed that can be used to make local sensitive hashing in :obj:`LSHSelfAttention` deterministic. This should
-            only be set for testing purposed. For evaluation and training purposes :obj:`hash_seed` should be left as
-            :obj:`None` to ensure fully random rotations in local sensitive hashing scheme.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"relu"`):
+        hash_seed (`int`, *optional*):
+            Seed that can be used to make locality sensitive hashing in `LSHSelfAttention` deterministic. This should
+            only be set for testing purposes. For evaluation and training purposes `hash_seed` should be left as
+            `None` to ensure fully random rotations in the locality sensitive hashing scheme.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"relu"`):
             The non-linear activation function (function or string) in the feed forward layer in the residual attention
-            block. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.05):
+            block. If string, `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.05):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        hidden_size (:obj:`int`, `optional`, defaults to 256):
+        hidden_size (`int`, *optional*, defaults to 256):
             Dimensionality of the output hidden states of the residual attention blocks.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        is_decoder (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to use a causal mask in addition to the :obj:`attention_mask` passed to
-            :class:`~transformers.ReformerModel`. When using the Reformer for causal language modeling, this argument
-            should be set to :obj:`True`.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether or not to use a causal mask in addition to the `attention_mask` passed to
+            [`ReformerModel`]. When using the Reformer for causal language modeling, this argument
+            should be set to `True`.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        local_chunk_length (:obj:`int`, `optional`, defaults to 64):
-            Length of chunk which attends to itself in :obj:`LocalSelfAttention`. Chunking reduces memory complexity
+        local_chunk_length (`int`, *optional*, defaults to 64):
+            Length of chunk which attends to itself in `LocalSelfAttention`. Chunking reduces memory complexity
             from sequence length x sequence length (self attention) to chunk length x chunk length x sequence length /
             chunk length (chunked self attention).
-        local_num_chunks_before (:obj:`int`, `optional`, defaults to 1):
-            Number of previous neighbouring chunks to attend to in :obj:`LocalSelfAttention` layer to itself.
-        local_num_chunks_after (:obj:`int`, `optional`, defaults to 0):
-            Number of following neighbouring chunks to attend to in :obj:`LocalSelfAttention` layer in addition to
+        local_num_chunks_before (`int`, *optional*, defaults to 1):
+            Number of previous neighbouring chunks to attend to in `LocalSelfAttention` layer in addition to itself.
+        local_num_chunks_after (`int`, *optional*, defaults to 0):
+            Number of following neighbouring chunks to attend to in `LocalSelfAttention` layer in addition to
             itself.
-        local_attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout ratio for the attention probabilities in :obj:`LocalSelfAttention`.
-        lsh_attn_chunk_length (:obj:`int`, `optional`, defaults to 64):
-            Length of chunk which attends to itself in :obj:`LSHSelfAttention`. Chunking reduces memory complexity from
+        local_attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities in `LocalSelfAttention`.
+        lsh_attn_chunk_length (`int`, *optional*, defaults to 64):
+            Length of chunk which attends to itself in `LSHSelfAttention`. Chunking reduces memory complexity from
             sequence length x sequence length (self attention) to chunk length x chunk length x sequence length / chunk
             length (chunked self attention).
-        lsh_num_chunks_before (:obj:`int`, `optional`, defaults to 1):
-            Number of previous neighbouring chunks to attend to in :obj:`LSHSelfAttention` layer to itself.
-        lsh_num_chunks_after (:obj:`int`, `optional`, defaults to 0):
-            Number of following neighbouring chunks to attend to in :obj:`LSHSelfAttention` layer to itself.
-        lsh_attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout ratio for the attention probabilities in :obj:`LSHSelfAttention`.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 4096):
+        lsh_num_chunks_before (`int`, *optional*, defaults to 1):
+            Number of previous neighbouring chunks to attend to in `LSHSelfAttention` layer in addition to itself.
+        lsh_num_chunks_after (`int`, *optional*, defaults to 0):
+            Number of following neighbouring chunks to attend to in `LSHSelfAttention` layer in addition to itself.
+        lsh_attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities in `LSHSelfAttention`.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        num_buckets (:obj:`int` or :obj:`List[int]`, `optional`):
+        num_buckets (`int` or `List[int]`, *optional*):
             Number of buckets, the key query vectors can be "hashed into" using the locality sensitive hashing scheme.
-            Each query key vector is hashed into a hash in :obj:`1, ..., num_buckets`. The number of buckets can also
+            Each query key vector is hashed into a hash in `1, ..., num_buckets`. The number of buckets can also
             be factorized into a list for improved memory complexity. In this case, each query key vector is hashed
-            into a hash in :obj:`1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if
-            :obj:`num_buckets` is factorized into two factors. The number of buckets (or the product the factors)
-            should approximately equal sequence length / lsh_chunk_length. If :obj:`num_buckets` not set, a good value
+            into a hash in `1-1, 1-2, ..., num_buckets[0]-1, ..., num_buckets[0]-num_buckets[1]` if
+            `num_buckets` is factorized into two factors. The number of buckets (or the product of the factors)
+            should approximately equal sequence length / lsh_chunk_length. If `num_buckets` is not set, a good value
             is calculated on the fly.
-        num_hashes (:obj:`int`, `optional`, defaults to 1):
+        num_hashes (`int`, *optional*, defaults to 1):
             Number of hashing rounds (e.g., number of random rotations) in Local Sensitive Hashing scheme. The higher
-            :obj:`num_hashes`, the more accurate the :obj:`LSHSelfAttention` becomes, but also the more memory and time
+            `num_hashes`, the more accurate the `LSHSelfAttention` becomes, but also the more memory and time
             intensive the hashing becomes.
-        pad_token_id (:obj:`int`, `optional`, defaults to 0):
+        pad_token_id (`int`, *optional*, defaults to 0):
             The token id for the padding token.
-        vocab_size (:obj:`int`, `optional`, defaults to 320):\
+        vocab_size (`int`, *optional*, defaults to 320):
             Vocabulary size of the Reformer model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.ReformerModel`.
-        tie_word_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            the `inputs_ids` passed when calling [`ReformerModel`].
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
             Whether to tie input and output embeddings.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
-        classifier_dropout (:obj:`float`, `optional`):
+        classifier_dropout (`float`, *optional*):
             The dropout ratio for the classification head.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import ReformerModel, ReformerConfig
+    ```python
+    >>> from transformers import ReformerModel, ReformerConfig
 
-        >>> # Initializing a Reformer configuration
-        >>> configuration = ReformerConfig()
+    >>> # Initializing a Reformer configuration
+    >>> configuration = ReformerConfig()
 
-        >>> # Initializing a Reformer model
-        >>> model = ReformerModel(configuration)
+    >>> # Initializing a Reformer model
+    >>> model = ReformerModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
     model_type = "reformer"
     keys_to_ignore_at_inference = ["past_buckets_states"]
     attribute_map = {}
diff --git a/src/transformers/models/reformer/tokenization_reformer.py b/src/transformers/models/reformer/tokenization_reformer.py
index c816e73a7a..cfd6fa6651 100644
--- a/src/transformers/models/reformer/tokenization_reformer.py
+++ b/src/transformers/models/reformer/tokenization_reformer.py
@@ -45,42 +45,44 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class ReformerTokenizer(PreTrainedTokenizer):
     """
-    Construct a Reformer tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__ .
+    Construct a Reformer tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a `.spm` extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        additional_special_tokens (:obj:`List[str]`, `optional`):
+        additional_special_tokens (`List[str]`, *optional*):
             Additional special tokens used by the tokenizer.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
     """
 
diff --git a/src/transformers/models/reformer/tokenization_reformer_fast.py b/src/transformers/models/reformer/tokenization_reformer_fast.py
index 3fc8583c81..f466e69837 100644
--- a/src/transformers/models/reformer/tokenization_reformer_fast.py
+++ b/src/transformers/models/reformer/tokenization_reformer_fast.py
@@ -53,29 +53,31 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class ReformerTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
-    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
+    Construct a "fast" Reformer tokenizer (backed by HuggingFace's *tokenizers* library). Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a `.spm` extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        additional_special_tokens (:obj:`List[str]`, `optional`):
+        additional_special_tokens (`List[str]`, *optional*):
             Additional special tokens used by the tokenizer.
     """
 
diff --git a/src/transformers/models/rembert/configuration_rembert.py b/src/transformers/models/rembert/configuration_rembert.py
index 51c899dfc9..5459afad4f 100644
--- a/src/transformers/models/rembert/configuration_rembert.py
+++ b/src/transformers/models/rembert/configuration_rembert.py
@@ -28,56 +28,60 @@ REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class RemBertConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.RemBertModel`. It is used to
+    This is the configuration class to store the configuration of a [`RemBertModel`]. It is used to
     instantiate an RemBERT model according to the specified arguments, defining the model architecture. Instantiating a
     configuration with the defaults will yield a similar configuration to that of the remert-large architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 250300):
+        vocab_size (`int`, *optional*, defaults to 250300):
             Vocabulary size of the RemBERT model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.RemBertModel` or
-            :class:`~transformers.TFRemBertModel`. Vocabulary size of the model. Defines the different tokens that can
-            be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.RemBertModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 1152):
+            `input_ids` passed when calling [`RemBertModel`] or
+            [`TFRemBertModel`]. Vocabulary size of the model. Defines the different tokens that can
+            be represented by the `input_ids` passed to the forward method of [`RemBertModel`].
+        hidden_size (`int`, *optional*, defaults to 1152):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 32):
+        num_hidden_layers (`int`, *optional*, defaults to 32):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 18):
+        num_attention_heads (`int`, *optional*, defaults to 18):
             Number of attention heads for each attention layer in the Transformer encoder.
-        input_embedding_size (:obj:`int`, `optional`, defaults to 256):
+        input_embedding_size (`int`, *optional*, defaults to 256):
             Dimensionality of the input embeddings.
-        output_embedding_size (:obj:`int`, `optional`, defaults to 1664):
+        output_embedding_size (`int`, *optional*, defaults to 1664):
             Dimensionality of the output embeddings.
-        intermediate_size (:obj:`int`, `optional`, defaults to 4608):
+        intermediate_size (`int`, *optional*, defaults to 4608):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0):
             The dropout ratio for the attention probabilities.
-        classifier_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the classifier layer when fine-tuning.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.RemBertModel` or
-            :class:`~transformers.TFRemBertModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`RemBertModel`] or
+            [`TFRemBertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if ``config.is_decoder=True``.
+            relevant if `config.is_decoder=True`.
 
-        Example::
+    Example:
+
+    ```python
+
+    ```
 
         >>> from transformers import RemBertModel, RemBertConfig
         >>> # Initializing a RemBERT rembert style configuration
diff --git a/src/transformers/models/rembert/tokenization_rembert.py b/src/transformers/models/rembert/tokenization_rembert.py
index 9b8f742283..d12cb75a3c 100644
--- a/src/transformers/models/rembert/tokenization_rembert.py
+++ b/src/transformers/models/rembert/tokenization_rembert.py
@@ -42,48 +42,54 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class RemBertTokenizer(PreTrainedTokenizer):
     """
-    Construct a RemBERT tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Construct a RemBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a `.spm` extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
 
     Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -170,17 +176,17 @@ class RemBertTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A REMBERT sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -193,18 +199,18 @@ class RemBertTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -226,21 +232,21 @@ class RemBertTokenizer(PreTrainedTokenizer):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RemBERT
         sequence pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/rembert/tokenization_rembert_fast.py b/src/transformers/models/rembert/tokenization_rembert_fast.py
index 06fa05b7a1..335aa92a05 100644
--- a/src/transformers/models/rembert/tokenization_rembert_fast.py
+++ b/src/transformers/models/rembert/tokenization_rembert_fast.py
@@ -51,44 +51,46 @@ SPIECE_UNDERLINE = "▁"
 
 class RemBertTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" RemBert tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
-    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__. This tokenizer
-    inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should
+    Construct a "fast" RemBert tokenizer (backed by HuggingFace's *tokenizers* library). Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This tokenizer
+    inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
     refer to this superclass for more information regarding those methods
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a `.spm` extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        remove_space (`bool`, *optional*, defaults to `True`):
             Whether or not to strip the text when tokenizing (removing excess spaces before and after the string).
-        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        keep_accents (`bool`, *optional*, defaults to `False`):
             Whether or not to keep accents when tokenizing.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        bos_token (`str`, *optional*, defaults to `"[CLS]"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-               When building a sequence using special tokens, this is not the token that is used for the beginning of
-               sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
             The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
-            that is used for the end of sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            that is used for the end of sequence. The token used is the `sep_token`.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
     """
@@ -145,17 +147,17 @@ class RemBertTokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A RemBERT sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added
-            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+            token_ids_1 (`List[int]`, *optional*, defaults to `None`):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -168,18 +170,18 @@ class RemBertTokenizerFast(PreTrainedTokenizerFast):
     ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+            token_ids_1 (`List[int]`, *optional*, defaults to `None`):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Set to True if the token list is already formatted with special tokens for the model
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -201,21 +203,21 @@ class RemBertTokenizerFast(PreTrainedTokenizerFast):
         Creates a mask from the two sequences passed to be used in a sequence-pair classification task. A RemBERT
         sequence pair mask has the following format:
 
-        ::
-
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
         if token_ids_1 is None, only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of ids.
-            token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
+            token_ids_1 (`List[int]`, *optional*, defaults to `None`):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/retribert/configuration_retribert.py b/src/transformers/models/retribert/configuration_retribert.py
index ffbb2af72f..6f5d15c290 100644
--- a/src/transformers/models/retribert/configuration_retribert.py
+++ b/src/transformers/models/retribert/configuration_retribert.py
@@ -28,44 +28,44 @@ RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class RetriBertConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.RetriBertModel`. It is used
+    This is the configuration class to store the configuration of a [`RetriBertModel`]. It is used
     to instantiate a RetriBertModel model according to the specified arguments, defining the model architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the RetriBERT model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.RetriBertModel`
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            the `input_ids` passed when calling [`RetriBertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed into [`BertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        share_encoders (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        share_encoders (`bool`, *optional*, defaults to `True`):
             Whether or not to use the same Bert-type encoder for the queries and document
-        projection_dim (:obj:`int`, `optional`, defaults to 128):
+        projection_dim (`int`, *optional*, defaults to 128):
             Final dimension of the query and document representation after projection
     """
     model_type = "retribert"
diff --git a/src/transformers/models/retribert/tokenization_retribert.py b/src/transformers/models/retribert/tokenization_retribert.py
index 085aafcd36..2c0ac65dd2 100644
--- a/src/transformers/models/retribert/tokenization_retribert.py
+++ b/src/transformers/models/retribert/tokenization_retribert.py
@@ -42,10 +42,10 @@ class RetriBertTokenizer(BertTokenizer):
     r"""
     Constructs a RetriBERT tokenizer.
 
-    :class:`~transformers.RetroBertTokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    [`RetriBertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
     tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/retribert/tokenization_retribert_fast.py b/src/transformers/models/retribert/tokenization_retribert_fast.py
index 91f299b70b..71a038bd9e 100644
--- a/src/transformers/models/retribert/tokenization_retribert_fast.py
+++ b/src/transformers/models/retribert/tokenization_retribert_fast.py
@@ -44,12 +44,12 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class RetriBertTokenizerFast(BertTokenizerFast):
     r"""
-    Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.RetriBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+    [`RetriBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/roberta/configuration_roberta.py b/src/transformers/models/roberta/configuration_roberta.py
index 25fc855bd4..db462b17c2 100644
--- a/src/transformers/models/roberta/configuration_roberta.py
+++ b/src/transformers/models/roberta/configuration_roberta.py
@@ -36,30 +36,31 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class RobertaConfig(BertConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel` or a
-    :class:`~transformers.TFRobertaModel`. It is used to instantiate a RoBERTa model according to the specified
+    This is the configuration class to store the configuration of a [`RobertaModel`] or a
+    [`TFRobertaModel`]. It is used to instantiate a RoBERTa model according to the specified
     arguments, defining the model architecture.
 
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
-    The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the
+    The [`RobertaConfig`] class directly inherits [`BertConfig`]. It reuses the
     same defaults. Please check the parent class for more information.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import RobertaConfig, RobertaModel
+    ```python
+    >>> from transformers import RobertaConfig, RobertaModel
 
-        >>> # Initializing a RoBERTa configuration
-        >>> configuration = RobertaConfig()
+    >>> # Initializing a RoBERTa configuration
+    >>> configuration = RobertaConfig()
 
-        >>> # Initializing a model from the configuration
-        >>> model = RobertaModel(configuration)
+    >>> # Initializing a model from the configuration
+    >>> model = RobertaModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "roberta"
 
     def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs):
diff --git a/src/transformers/models/roberta/tokenization_roberta.py b/src/transformers/models/roberta/tokenization_roberta.py
index 8e9a0fbbc2..43aa99fc94 100644
--- a/src/transformers/models/roberta/tokenization_roberta.py
+++ b/src/transformers/models/roberta/tokenization_roberta.py
@@ -64,64 +64,71 @@ class RobertaTokenizer(GPT2Tokenizer):
     This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
     be encoded differently whether it is at the beginning of the sentence (without space) or not:
 
-    ::
+    ```
+    >>> from transformers import RobertaTokenizer
+    >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+    >>> tokenizer("Hello world")['input_ids']
+    [0, 31414, 232, 328, 2]
+    >>> tokenizer(" Hello world")['input_ids']
+    [0, 20920, 232, 2]
+    ```
 
-        >>> from transformers import RobertaTokenizer
-        >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
-        >>> tokenizer("Hello world")['input_ids']
-        [0, 31414, 232, 328, 2]
-        >>> tokenizer(" Hello world")['input_ids']
-        [0, 20920, 232, 2]
-
-    You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
     call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
 
-    .. note::
+    <Tip>
 
-        When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first
-        one).
+    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first
+    one).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Path to the merges file.
-        errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
-            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
             other word. (RoBERTa tokenizer detect beginning of words by the preceding space).
     """
@@ -178,17 +185,17 @@ class RobertaTokenizer(GPT2Tokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A RoBERTa sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -201,18 +208,18 @@ class RobertaTokenizer(GPT2Tokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
@@ -231,13 +238,13 @@ class RobertaTokenizer(GPT2Tokenizer):
         make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
diff --git a/src/transformers/models/roberta/tokenization_roberta_fast.py b/src/transformers/models/roberta/tokenization_roberta_fast.py
index 124fe3fce2..140c9f6d1d 100644
--- a/src/transformers/models/roberta/tokenization_roberta_fast.py
+++ b/src/transformers/models/roberta/tokenization_roberta_fast.py
@@ -65,73 +65,80 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class RobertaTokenizerFast(GPT2TokenizerFast):
     """
-    Construct a "fast" RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library), derived from the GPT-2
+    Construct a "fast" RoBERTa tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2
     tokenizer, using byte-level Byte-Pair-Encoding.
 
     This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
     be encoded differently whether it is at the beginning of the sentence (without space) or not:
 
-    ::
+    ```
+    >>> from transformers import RobertaTokenizerFast
+    >>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+    >>> tokenizer("Hello world")['input_ids']
+    [0, 31414, 232, 328, 2]
+    >>> tokenizer(" Hello world")['input_ids']
+    [0, 20920, 232, 2]
+    ```
 
-        >>> from transformers import RobertaTokenizerFast
-        >>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
-        >>> tokenizer("Hello world")['input_ids']
-        [0, 31414, 232, 328, 2]
-        >>> tokenizer(" Hello world")['input_ids']
-        [0, 20920, 232, 2]
-
-    You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
     call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
 
-    .. note::
+    <Tip>
 
-        When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
-        ``add_prefix_space=True``.
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with
+    `add_prefix_space=True`.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Path to the merges file.
-        errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
-            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
-            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
             Whether or not to add an initial space to the input. This allows to treat the leading word just as any
             other word. (RoBERTa tokenizer detect beginning of words by the preceding space).
-        trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        trim_offsets (`bool`, *optional*, defaults to `True`):
             Whether the post processing step should trim offsets to avoid including whitespaces.
     """
 
@@ -176,11 +183,11 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
     @property
     def mask_token(self) -> str:
         """
-        :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
+        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
         not having been set.
 
         Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
-        comprise the space before the `<mask>`.
+        comprise the space before the `<mask>`.
         """
         if self._mask_token is None and self.verbose:
             logger.error("Using mask_token, but it is not set yet.")
@@ -214,13 +221,13 @@ class RobertaTokenizerFast(GPT2TokenizerFast):
         make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
diff --git a/src/transformers/models/roformer/configuration_roformer.py b/src/transformers/models/roformer/configuration_roformer.py
index 5027b3be1f..9ea3e57a14 100644
--- a/src/transformers/models/roformer/configuration_roformer.py
+++ b/src/transformers/models/roformer/configuration_roformer.py
@@ -33,67 +33,68 @@ ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class RoFormerConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.RoFormerModel`. It is used to
+    This is the configuration class to store the configuration of a [`RoFormerModel`]. It is used to
     instantiate an RoFormer model according to the specified arguments, defining the model architecture. Instantiating
     a configuration with the defaults will yield a similar configuration to that of the RoFormer
-    `junnyu/roformer_chinese_base <https://huggingface.co/junnyu/roformer_chinese_base>`__ architecture.
+    [junnyu/roformer_chinese_base](https://huggingface.co/junnyu/roformer_chinese_base) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50000):
+        vocab_size (`int`, *optional*, defaults to 50000):
             Vocabulary size of the RoFormer model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.RoFormerModel` or
-            :class:`~transformers.TFRoFormerModel`.
-        embedding_size (:obj:`int`, `optional`, defaults to None):
-            Dimensionality of the encoder layers and the pooler layer. Defaults to the :obj:`hidden_size` if not
+            the `input_ids` passed when calling [`RoFormerModel`] or
+            [`TFRoFormerModel`].
+        embedding_size (`int`, *optional*, defaults to None):
+            Dimensionality of the encoder layers and the pooler layer. Defaults to the `hidden_size` if not
             provided.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimension of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 1536):
+        max_position_embeddings (`int`, *optional*, defaults to 1536):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 1536).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.RoFormerModel`
-            or :class:`~transformers.TFRoFormerModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`RoFormerModel`]
+            or [`TFRoFormerModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if ``config.is_decoder=True``.
-        rotary_value (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            relevant if `config.is_decoder=True`.
+        rotary_value (`bool`, *optional*, defaults to `False`):
             Whether or not apply rotary position embeddings on value layer.
 
-    Example::
+    Example:
 
-        >>> from transformers import RoFormerModel, RoFormerConfig
+    ```python
+    >>> from transformers import RoFormerModel, RoFormerConfig
 
-        >>> # Initializing a RoFormer junnyu/roformer_chinese_base style configuration
-        >>> configuration = RoFormerConfig()
+    >>> # Initializing a RoFormer junnyu/roformer_chinese_base style configuration
+    >>> configuration = RoFormerConfig()
 
-        >>> # Initializing a model from the junnyu/roformer_chinese_base style configuration
-        >>> model = RoFormerModel(configuration)
+    >>> # Initializing a model from the junnyu/roformer_chinese_base style configuration
+    >>> model = RoFormerModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "roformer"
 
     def __init__(
diff --git a/src/transformers/models/roformer/tokenization_roformer.py b/src/transformers/models/roformer/tokenization_roformer.py
index c2fb637c4c..3e9e8832fd 100644
--- a/src/transformers/models/roformer/tokenization_roformer.py
+++ b/src/transformers/models/roformer/tokenization_roformer.py
@@ -60,53 +60,52 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class RoFormerTokenizer(PreTrainedTokenizer):
     r"""
-    Construct a RoFormer tokenizer. Based on `Rust Jieba <https://pypi.org/project/rjieba/>`.
+    Construct a RoFormer tokenizer. Based on [Rust Jieba](https://pypi.org/project/rjieba/).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
             Whether or not to do basic tokenization before WordPiece.
-        never_split (:obj:`Iterable`, `optional`):
+        never_split (`Iterable`, *optional*):
             Collection of tokens which will never be split during tokenization. Only has an effect when
-            :obj:`do_basic_tokenize=True`
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+            `do_basic_tokenize=True`
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
             Whether or not to tokenize Chinese characters.
 
-            This should likely be deactivated for Japanese (see this `issue
-            <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+            This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
+            value for `lowercase` (as in the original BERT).
 
-    Example::
+    Example:
 
-        >>> from transformers import RoFormerTokenizer
-        >>> tokenizer = RoFormerTokenizer.from_pretrained('junnyu/roformer_chinese_base')
-        >>> tokenizer.tokenize("今天天气非常好。")
-        # ['今', '天', '天', '气', '非常', '好', '。']
-
-    """
+    ```python
+    >>> from transformers import RoFormerTokenizer
+    >>> tokenizer = RoFormerTokenizer.from_pretrained('junnyu/roformer_chinese_base')
+    >>> tokenizer.tokenize("今天天气非常好。")
+    # ['今', '天', '天', '气', '非常', '好', '。']
+    ```"""
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
@@ -230,17 +229,17 @@ class RoFormerTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A RoFormer sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -253,18 +252,18 @@ class RoFormerTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -283,21 +282,21 @@ class RoFormerTokenizer(PreTrainedTokenizer):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RoFormer
         sequence pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/roformer/tokenization_roformer_fast.py b/src/transformers/models/roformer/tokenization_roformer_fast.py
index 736f157f92..b7ef87181f 100644
--- a/src/transformers/models/roformer/tokenization_roformer_fast.py
+++ b/src/transformers/models/roformer/tokenization_roformer_fast.py
@@ -62,23 +62,23 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class RoFormerTokenizerFast(PreTrainedTokenizerFast):
     r"""
-    Construct a "fast" RoFormer tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" RoFormer tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.RoFormerTokenizerFast` is almost identical to :class:`~transformers.BertTokenizerFast` and
+    [`RoFormerTokenizerFast`] is almost identical to [`BertTokenizerFast`] and
     runs end-to-end tokenization: punctuation splitting and wordpiece. There are some difference between them when
     tokenizing Chinese.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
-    Example::
+    Example:
 
-        >>> from transformers import RoFormerTokenizerFast
-        >>> tokenizer = RoFormerTokenizerFast.from_pretrained('junnyu/roformer_chinese_base')
-        >>> tokenizer.tokenize("今天天气非常好。")
-        # ['今', '天', '天', '气', '非常', '好', '。']
-
-    """
+    ```python
+    >>> from transformers import RoFormerTokenizerFast
+    >>> tokenizer = RoFormerTokenizerFast.from_pretrained('junnyu/roformer_chinese_base')
+    >>> tokenizer.tokenize("今天天气非常好。")
+    # ['今', '天', '天', '气', '非常', '好', '。']
+    ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
@@ -141,17 +141,17 @@ class RoFormerTokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A RoFormer sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences: ``[CLS] A [SEP] B [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
 
@@ -167,21 +167,21 @@ class RoFormerTokenizerFast(PreTrainedTokenizerFast):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. A RoFormer
         sequence pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/segformer/configuration_segformer.py b/src/transformers/models/segformer/configuration_segformer.py
index c2283169db..750302d1a2 100644
--- a/src/transformers/models/segformer/configuration_segformer.py
+++ b/src/transformers/models/segformer/configuration_segformer.py
@@ -28,75 +28,76 @@ SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class SegformerConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.SegformerModel`. It is used
+    This is the configuration class to store the configuration of a [`SegformerModel`]. It is used
     to instantiate an SegFormer model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the SegFormer
-    `nvidia/segformer-b0-finetuned-ade-512-512 <https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512>`__
+    [nvidia/segformer-b0-finetuned-ade-512-512](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
     architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        image_size (:obj:`int`, `optional`, defaults to 512):
+        image_size (`int`, *optional*, defaults to 512):
             The size (resolution) of each image.
-        num_channels (:obj:`int`, `optional`, defaults to 3):
+        num_channels (`int`, *optional*, defaults to 3):
             The number of input channels.
-        num_encoder_blocks (:obj:`int`, `optional`, defaults to 4):
+        num_encoder_blocks (`int`, *optional*, defaults to 4):
             The number of encoder blocks (i.e. stages in the Mix Transformer encoder).
-        depths (:obj:`List[int]`, `optional`, defaults to [2, 2, 2, 2]):
+        depths (`List[int]`, *optional*, defaults to [2, 2, 2, 2]):
             The number of layers in each encoder block.
-        sr_ratios (:obj:`List[int]`, `optional`, defaults to [8, 4, 2, 1]):
+        sr_ratios (`List[int]`, *optional*, defaults to [8, 4, 2, 1]):
             Sequence reduction ratios in each encoder block.
-        hidden_sizes (:obj:`List[int]`, `optional`, defaults to [32, 64, 160, 256]):
+        hidden_sizes (`List[int]`, *optional*, defaults to [32, 64, 160, 256]):
             Dimension of each of the encoder blocks.
-        downsampling_rates (:obj:`List[int]`, `optional`, defaults to [1, 4, 8, 16]):
+        downsampling_rates (`List[int]`, *optional*, defaults to [1, 4, 8, 16]):
             Downsample rate of the image resolution compared to the original image size before each encoder block.
-        patch_sizes (:obj:`List[int]`, `optional`, defaults to [7, 3, 3, 3]):
+        patch_sizes (`List[int]`, *optional*, defaults to [7, 3, 3, 3]):
             Patch size before each encoder block.
-        strides (:obj:`List[int]`, `optional`, defaults to [4, 2, 2, 2]):
+        strides (`List[int]`, *optional*, defaults to [4, 2, 2, 2]):
             Stride before each encoder block.
-        num_attention_heads (:obj:`List[int]`, `optional`, defaults to [1, 2, 4, 8]):
+        num_attention_heads (`List[int]`, *optional*, defaults to [1, 2, 4, 8]):
             Number of attention heads for each attention layer in each block of the Transformer encoder.
-        mlp_ratios (:obj:`List[int]`, `optional`, defaults to [4, 4, 4, 4]):
+        mlp_ratios (`List[int]`, *optional*, defaults to [4, 4, 4, 4]):
             Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the
             encoder blocks.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.0):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.0):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        classifier_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability before the classification head.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        drop_path_rate (:obj:`float`, `optional`, defaults to 0.1):
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
             The dropout probability for stochastic depth, used in the blocks of the Transformer encoder.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-6):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
             The epsilon used by the layer normalization layers.
-        decoder_hidden_size (:obj:`int`, `optional`, defaults to 256):
+        decoder_hidden_size (`int`, *optional*, defaults to 256):
             The dimension of the all-MLP decode head.
-        reshape_last_stage (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to reshape the features of the last stage back to :obj:`(batch_size, num_channels, height, width)`.
+        reshape_last_stage (`bool`, *optional*, defaults to `True`):
+            Whether to reshape the features of the last stage back to `(batch_size, num_channels, height, width)`.
             Only required for the semantic segmentation model.
-        semantic_loss_ignore_index (:obj:`int`, `optional`, defaults to 255):
+        semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
             The index that is ignored by the loss function of the semantic segmentation model.
 
-    Example::
+    Example:
 
-        >>> from transformers import SegformerModel, SegformerConfig
+    ```python
+    >>> from transformers import SegformerModel, SegformerConfig
 
-        >>> # Initializing a SegFormer nvidia/segformer-b0-finetuned-ade-512-512 style configuration
-        >>> configuration = SegformerConfig()
+    >>> # Initializing a SegFormer nvidia/segformer-b0-finetuned-ade-512-512 style configuration
+    >>> configuration = SegformerConfig()
 
-        >>> # Initializing a model from the nvidia/segformer-b0-finetuned-ade-512-512 style configuration
-        >>> model = SegformerModel(configuration)
+    >>> # Initializing a model from the nvidia/segformer-b0-finetuned-ade-512-512 style configuration
+    >>> model = SegformerModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "segformer"
 
     def __init__(
diff --git a/src/transformers/models/segformer/feature_extraction_segformer.py b/src/transformers/models/segformer/feature_extraction_segformer.py
index 5dbc1d8e98..14c6619446 100644
--- a/src/transformers/models/segformer/feature_extraction_segformer.py
+++ b/src/transformers/models/segformer/feature_extraction_segformer.py
@@ -38,28 +38,28 @@ class SegformerFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi
     r"""
     Constructs a SegFormer feature extractor.
 
-    This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main
+    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to resize the input based on a certain :obj:`size`.
-        size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 512):
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input based on a certain `size`.
+        size (`int` or `Tuple(int)`, *optional*, defaults to 512):
             Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
-            integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize`
-            is set to :obj:`True`.
-        resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BILINEAR`):
-            An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
-            :obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
-            Only has an effect if :obj:`do_resize` is set to :obj:`True`.
-        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
+            is set to `True`.
+        resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
+            An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
+            `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
+            Only has an effect if `do_resize` is set to `True`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
             Whether or not to normalize the input with mean and standard deviation.
-        image_mean (:obj:`int`, `optional`, defaults to :obj:`[0.485, 0.456, 0.406]`):
+        image_mean (`List[float]`, *optional*, defaults to `[0.485, 0.456, 0.406]`):
             The sequence of means for each channel, to be used when normalizing images. Defaults to the ImageNet mean.
+        image_std (`List[float]`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
+        image_std (`int`, *optional*, defaults to `[0.229, 0.224, 0.225]`):
             The sequence of standard deviations for each channel, to be used when normalizing images. Defaults to the
             ImageNet std.
-        reduce_labels (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        reduce_labels (`bool`, *optional*, defaults to `False`):
             Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is
             used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The
             background label will be replaced by 255.
@@ -97,34 +97,36 @@ class SegformerFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMi
         """
         Main method to prepare for the model one or several image(s) and optional corresponding segmentation maps.
 
-        .. warning::
+        <Tip warning={true}>
 
-           NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
-           PIL images.
+        NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
+        PIL images.
+
+        </Tip>
 
         Args:
-            images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                 tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is
                 the number of channels, H and W are image height and width.
 
-            segmentation_maps (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`, `optional`):
+            segmentation_maps (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
                 Optionally, the corresponding semantic segmentation maps with the pixel-wise annotations.
 
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
                 If set, will return tensors of a particular framework. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
-                * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
 
         Returns:
-            :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
 
             - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
               width).
-            - **labels** -- Optional labels to be fed to a model (when :obj:`segmentation_maps` are provided)
+            - **labels** -- Optional labels to be fed to a model (when `segmentation_maps` are provided)
         """
         # Input type checking for clearer error
         valid_images = False
diff --git a/src/transformers/models/segformer/modeling_segformer.py b/src/transformers/models/segformer/modeling_segformer.py
index bdbc577d43..51313688f6 100755
--- a/src/transformers/models/segformer/modeling_segformer.py
+++ b/src/transformers/models/segformer/modeling_segformer.py
@@ -485,22 +485,23 @@ class SegformerModel(SegformerPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import SegformerFeatureExtractor, SegformerModel
-            >>> from PIL import Image
-            >>> import requests
+        ```python
+        >>> from transformers import SegformerFeatureExtractor, SegformerModel
+        >>> from PIL import Image
+        >>> import requests
 
-            >>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
-            >>> model = SegformerModel("nvidia/segformer-b0-finetuned-ade-512-512")
+        >>> feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
+        >>> model = SegformerModel.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
 
-            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-            >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> image = Image.open(requests.get(url, stream=True).raw)
 
-            >>> inputs = feature_extractor(images=image, return_tensors="pt")
-            >>> outputs = model(**inputs)
-            >>> sequence_output = outputs.last_hidden_state
-        """
+        >>> inputs = feature_extractor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> sequence_output = outputs.last_hidden_state
+        ```"""
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
diff --git a/src/transformers/models/sew/configuration_sew.py b/src/transformers/models/sew/configuration_sew.py
index a5a7ff7908..cd939e9d71 100644
--- a/src/transformers/models/sew/configuration_sew.py
+++ b/src/transformers/models/sew/configuration_sew.py
@@ -28,123 +28,121 @@ SEW_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class SEWConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.SEWModel`. It is used to
+    This is the configuration class to store the configuration of a [`SEWModel`]. It is used to
     instantiate a SEW model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the SEW `asapp/sew-tiny-100k
-    <https://huggingface.co/asapp/sew-tiny-100k>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the SEW [asapp/sew-tiny-100k](https://huggingface.co/asapp/sew-tiny-100k) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 32):
+        vocab_size (`int`, *optional*, defaults to 32):
             Vocabulary size of the SEW model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.SEW`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `inputs_ids` passed when calling [`SEWModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        squeeze_factor (:obj:`int`, `optional`, defaults to 2):
+        squeeze_factor (`int`, *optional*, defaults to 2):
             Sequence length downsampling factor after the encoder and upsampling factor after the transformer.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        attention_dropout (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        final_dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probability for the final projection layer of :class:`SEWForCTC`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`SEWForCTC`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        feat_extract_norm (:obj:`str`, `optional`, defaults to :obj:`"group"`):
-            The norm to be applied to 1D convolutional layers in feature extractor. One of :obj:`"group"` for group
-            normalization of only the first 1D convolutional layer or :obj:`"layer"` for layer normalization of all 1D
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
             convolutional layers.
-        feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probability for output of the feature extractor.
-        feat_extract_activation (:obj:`str, `optional`, defaults to :obj:`"gelu"`):
+        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the 1D convolutional layers of the feature
-            extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        conv_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`):
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        conv_dim (`Tuple[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
-            feature extractor. The length of `conv_dim` defines the number of 1D convolutional layers.
-        conv_stride (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
+            feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
-            of `conv_stride` defines the number of convolutional layers and has to match the the length of `conv_dim`.
-        conv_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
-            length of `conv_kernel` defines the number of convolutional layers and has to match the the length of
-            `conv_dim`.
-        conv_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
             Whether the 1D convolutional layers have a bias.
-        num_conv_pos_embeddings (:obj:`int`, `optional`, defaults to 128):
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
             Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
             embeddings layer.
-        num_conv_pos_embedding_groups (:obj:`int`, `optional`, defaults to 16):
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
             Number of groups of 1D convolutional positional embeddings layer.
-        apply_spec_augment (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
             Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
-            `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
-            <https://arxiv.org/abs/1904.08779>`__.
-        mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
             procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
-            masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
-            the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
-        mask_time_length (:obj:`int`, `optional`, defaults to 10):
+            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease
+            the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
             Length of vector span along the time axis.
-        mask_time_min_masks (:obj:`int`, `optional`, defaults to 2),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the time axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_time_min_masks (`int`, *optional*, defaults to 2):
+            The minimum number of masks of length `mask_time_length` generated along the time axis, each time
+            step, irrespectively of `mask_time_prob`. Only relevant if
             ''mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks''
-        mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
             masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
             the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
-            span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
-            overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
-            is True``.
-        mask_feature_length (:obj:`int`, `optional`, defaults to 10):
+            span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that
+            overlap may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
             Length of vector span along the feature axis.
-        mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_feature_min_masks (`int`, *optional*, defaults to 0):
+            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+            step, irrespectively of `mask_feature_prob`. Only relevant if
             ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
-        ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
-            Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
-            instance of :class:`~transformers.SEWForCTC`.
-        ctc_zero_infinity (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to zero infinite losses and the associated gradients of ``torch.nn.CTCLoss``. Infinite losses
+        ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`SEWForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses
             mainly occur when the inputs are too short to be aligned to the targets. Only relevant when training an
-            instance of :class:`~transformers.SEWForCTC`.
-        use_weighted_layer_sum (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            instance of [`SEWForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
             Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
-            instance of :class:`~transformers.Wav2Vec2ForSequenceClassification`.
-        classifier_proj_size (:obj:`int`, `optional`, defaults to 256):
+            instance of [`Wav2Vec2ForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 256):
             Dimensionality of the projection before token mean-pooling for classification.
 
-    Example::
+    Example:
 
-        >>> from transformers import SEWModel, SEWConfig
+    ```python
+    >>> from transformers import SEWModel, SEWConfig
 
-        >>> # Initializing a SEW asapp/sew-tiny-100k style configuration
-        >>> configuration = SEWConfig()
+    >>> # Initializing a SEW asapp/sew-tiny-100k style configuration
+    >>> configuration = SEWConfig()
 
-        >>> # Initializing a model from the asapp/sew-tiny-100k style configuration
-        >>> model = SEWModel(configuration)
+    >>> # Initializing a model from the asapp/sew-tiny-100k style configuration
+    >>> model = SEWModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "sew"
 
     def __init__(
diff --git a/src/transformers/models/sew_d/configuration_sew_d.py b/src/transformers/models/sew_d/configuration_sew_d.py
index 09976c9204..31727e85a5 100644
--- a/src/transformers/models/sew_d/configuration_sew_d.py
+++ b/src/transformers/models/sew_d/configuration_sew_d.py
@@ -28,143 +28,141 @@ SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class SEWDConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.SEWDModel`. It is used to
+    This is the configuration class to store the configuration of a [`SEWDModel`]. It is used to
     instantiate a SEW-D model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the SEW-D `asapp/sew-d-tiny-100k
-    <https://huggingface.co/asapp/sew-d-tiny-100k>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the SEW-D [asapp/sew-d-tiny-100k](https://huggingface.co/asapp/sew-d-tiny-100k) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 32):
+        vocab_size (`int`, *optional*, defaults to 32):
             Vocabulary size of the SEW-D model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.SEWD`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `inputs_ids` passed when calling [`SEWDModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        squeeze_factor (:obj:`int`, `optional`, defaults to 2):
+        squeeze_factor (`int`, *optional*, defaults to 2):
             Sequence length downsampling factor after the encoder and upsampling factor after the transformer.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        position_buckets (:obj:`int`, `optional`, defaults to 256):
+        position_buckets (`int`, *optional*, defaults to 256):
             The maximum size of relative position embeddings.
-        share_att_key (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        share_att_key (`bool`, *optional*, defaults to `True`):
             Whether to share attention key with c2p and p2c.
-        relative_attention (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        relative_attention (`bool`, *optional*, defaults to `True`):
             Whether to use relative position encoding.
-        position_biased_input (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        position_biased_input (`bool`, *optional*, defaults to `False`):
             Whether to add absolute position embedding to content embedding.
-        pos_att_type (:obj:`Tuple[str]`, `optional`, defaults to :obj:`("p2c", "c2p")`):
-            The type of relative position attention, it can be a combination of :obj:`("p2c", "c2p", "p2p")`, e.g.
-            :obj:`("p2c")`, :obj:`("p2c", "c2p")`, :obj:`("p2c", "c2p", 'p2p")`.
-        norm_rel_ebd (:obj:`str`, `optional`, defaults to :obj:`"layer_norm"`):
-            Whether to use layer norm in relative embedding (:obj:`"layer_norm"` if yes)
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu_python"`):
+        pos_att_type (`Tuple[str]`, *optional*, defaults to `("p2c", "c2p")`):
+            The type of relative position attention, it can be a combination of `("p2c", "c2p", "p2p")`, e.g.
+            `("p2c")`, `("p2c", "c2p")`, `("p2c", "c2p", "p2p")`.
+        norm_rel_ebd (`str`, *optional*, defaults to `"layer_norm"`):
+            Whether to use layer norm in relative embedding (`"layer_norm"` if yes)
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_python"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"`, :obj:`"gelu_python"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"`, `"gelu_python"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        attention_dropout (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        final_dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probability for the final projection layer of :class:`SEWDForCTC`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`SEWDForCTC`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-7):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-7):
             The epsilon used by the layer normalization layers in the transformer encoder.
-        feature_layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-5):
+        feature_layer_norm_eps (`float`, *optional*, defaults to 1e-5):
             The epsilon used by the layer normalization after the feature extractor.
-        feat_extract_norm (:obj:`str`, `optional`, defaults to :obj:`"group"`):
-            The norm to be applied to 1D convolutional layers in feature extractor. One of :obj:`"group"` for group
-            normalization of only the first 1D convolutional layer or :obj:`"layer"` for layer normalization of all 1D
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
             convolutional layers.
-        feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probability for output of the feature extractor.
-        feat_extract_activation (:obj:`str, `optional`, defaults to :obj:`"gelu"`):
+        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the 1D convolutional layers of the feature
-            extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        conv_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`):
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        conv_dim (`Tuple[int]`, *optional*, defaults to `(64, 128, 128, 128, 128, 256, 256, 256, 256, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
-            feature extractor. The length of `conv_dim` defines the number of 1D convolutional layers.
-        conv_stride (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
+            feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
-            of `conv_stride` defines the number of convolutional layers and has to match the the length of `conv_dim`.
-        conv_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 1, 3, 1, 3, 1, 3, 1, 2, 1, 2, 1)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
-            length of `conv_kernel` defines the number of convolutional layers and has to match the the length of
-            `conv_dim`.
-        conv_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
             Whether the 1D convolutional layers have a bias.
-        num_conv_pos_embeddings (:obj:`int`, `optional`, defaults to 128):
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
             Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
             embeddings layer.
-        num_conv_pos_embedding_groups (:obj:`int`, `optional`, defaults to 16):
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
             Number of groups of 1D convolutional positional embeddings layer.
-        apply_spec_augment (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
             Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
-            `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
-            <https://arxiv.org/abs/1904.08779>`__.
-        mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
             procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
-            masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
-            the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
-        mask_time_length (:obj:`int`, `optional`, defaults to 10):
+            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease
+            the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
             Length of vector span along the time axis.
-        mask_time_min_masks (:obj:`int`, `optional`, defaults to 2),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the time axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_time_min_masks (`int`, *optional*, defaults to 2):
+            The minimum number of masks of length `mask_time_length` generated along the time axis, each time
+            step, irrespectively of `mask_time_prob`. Only relevant if
             ''mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks''
-        mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
             masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
             the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
-            span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
-            overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
-            is True``.
-        mask_feature_length (:obj:`int`, `optional`, defaults to 10):
+            span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that
+            overlap may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
             Length of vector span along the feature axis.
-        mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_feature_min_masks (`int`, *optional*, defaults to 0):
+            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+            step, irrespectively of `mask_feature_prob`. Only relevant if
             ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
-        diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1):
+        diversity_loss_weight (`float`, *optional*, defaults to 0.1):
             The weight of the codebook diversity loss component.
-        ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
-            Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
-            instance of :class:`~transformers.SEWDForCTC`.
-        ctc_zero_infinity (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to zero infinite losses and the associated gradients of ``torch.nn.CTCLoss``. Infinite losses
+        ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`SEWDForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses
             mainly occur when the inputs are too short to be aligned to the targets. Only relevant when training an
-            instance of :class:`~transformers.SEWDForCTC`.
-        use_weighted_layer_sum (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            instance of [`SEWDForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
             Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
-            instance of :class:`~transformers.Wav2Vec2ForSequenceClassification`.
-        classifier_proj_size (:obj:`int`, `optional`, defaults to 256):
+            instance of [`Wav2Vec2ForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 256):
             Dimensionality of the projection before token mean-pooling for classification.
 
-    Example::
+    Example:
 
-        >>> from transformers import SEWDModel, SEWDConfig
+    ```python
+    >>> from transformers import SEWDModel, SEWDConfig
 
-        >>> # Initializing a SEW-D asapp/sew-d-tiny-100k style configuration
-        >>> configuration = SEWDConfig()
+    >>> # Initializing a SEW-D asapp/sew-d-tiny-100k style configuration
+    >>> configuration = SEWDConfig()
 
-        >>> # Initializing a model from the asapp/sew-d-tiny-100k style configuration
-        >>> model = SEWDModel(configuration)
+    >>> # Initializing a model from the asapp/sew-d-tiny-100k style configuration
+    >>> model = SEWDModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "sew-d"
 
     def __init__(
diff --git a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py
index dff79b35c3..6fa18bc124 100644
--- a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py
@@ -26,49 +26,50 @@ logger = logging.get_logger(__name__)
 
 class SpeechEncoderDecoderConfig(PretrainedConfig):
     r"""
-    :class:`~transformers.SpeechEncoderDecoderConfig` is the configuration class to store the configuration of a
-    :class:`~transformers.SpeechEncoderDecoderModel`. It is used to instantiate an Encoder Decoder model according to
+    [`SpeechEncoderDecoderConfig`] is the configuration class to store the configuration of a
+    [`SpeechEncoderDecoderModel`]. It is used to instantiate an Encoder Decoder model according to
     the specified arguments, defining the encoder and decoder configs.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        kwargs (`optional`):
+        kwargs (*optional*):
             Dictionary of keyword arguments. Notably:
 
-                - **encoder** (:class:`~transformers.PretrainedConfig`, `optional`) -- An instance of a configuration
+                - **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration
                   object that defines the encoder config.
-                - **decoder** (:class:`~transformers.PretrainedConfig`, `optional`) -- An instance of a configuration
+                - **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration
                   object that defines the decoder config.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import BertConfig, Wav2Vec2Config, SpeechEncoderDecoderConfig, SpeechEncoderDecoderModel
+    ```python
+    >>> from transformers import BertConfig, Wav2Vec2Config, SpeechEncoderDecoderConfig, SpeechEncoderDecoderModel
 
-        >>> # Initializing a Wav2Vec2 & BERT style configuration
-        >>> config_encoder = Wav2Vec2Config()
-        >>> config_decoder = BertConfig()
+    >>> # Initializing a Wav2Vec2 & BERT style configuration
+    >>> config_encoder = Wav2Vec2Config()
+    >>> config_decoder = BertConfig()
 
-        >>> config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+    >>> config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
 
-        >>> # Initializing a Wav2Vec2Bert model from a Wav2Vec2 & bert-base-uncased style configurations
-        >>> model = SpeechEncoderDecoderModel(config=config)
+    >>> # Initializing a Wav2Vec2Bert model from a Wav2Vec2 & bert-base-uncased style configurations
+    >>> model = SpeechEncoderDecoderModel(config=config)
 
-        >>> # Accessing the model configuration
-        >>> config_encoder = model.config.encoder
-        >>> config_decoder  = model.config.decoder
-        >>> # set decoder config to causal lm
-        >>> config_decoder.is_decoder = True
-        >>> config_decoder.add_cross_attention = True
+    >>> # Accessing the model configuration
+    >>> config_encoder = model.config.encoder
+    >>> config_decoder  = model.config.decoder
+    >>> # set decoder config to causal lm
+    >>> config_decoder.is_decoder = True
+    >>> config_decoder.add_cross_attention = True
 
-        >>> # Saving the model, including its configuration
-        >>> model.save_pretrained('my-model')
+    >>> # Saving the model, including its configuration
+    >>> model.save_pretrained('my-model')
 
-        >>> # loading model and config from pretrained folder
-        >>> encoder_decoder_config = SpeechEncoderDecoderConfig.from_pretrained('my-model')
-        >>> model = SpeechEncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
-    """
+    >>> # loading model and config from pretrained folder
+    >>> encoder_decoder_config = SpeechEncoderDecoderConfig.from_pretrained('my-model')
+    >>> model = SpeechEncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
+    ```"""
     model_type = "speech-encoder-decoder"
     is_composition = True
 
@@ -93,11 +94,11 @@ class SpeechEncoderDecoderConfig(PretrainedConfig):
         cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
     ) -> PretrainedConfig:
         r"""
-        Instantiate a :class:`~transformers.SpeechEncoderDecoderConfig` (or a derived class) from a pre-trained encoder
+        Instantiate a [`SpeechEncoderDecoderConfig`] (or a derived class) from a pre-trained encoder
         model configuration and decoder model configuration.
 
         Returns:
-            :class:`SpeechEncoderDecoderConfig`: An instance of a configuration object
+            [`SpeechEncoderDecoderConfig`]: An instance of a configuration object
         """
         logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
         decoder_config.is_decoder = True
@@ -107,10 +108,10 @@ class SpeechEncoderDecoderConfig(PretrainedConfig):
 
     def to_dict(self):
         """
-        Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig`.
+        Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig`.
 
         Returns:
-            :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+            `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance.
         """
         output = copy.deepcopy(self.__dict__)
         output["encoder"] = self.encoder.to_dict()
diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
index 8523d6ef81..b05b8af285 100644
--- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
@@ -435,26 +435,26 @@ class SpeechEncoderDecoderModel(PreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import SpeechEncoderDecoderModel, Speech2Text2Processor
-            >>> from datasets import load_dataset
-            >>> import torch
+        ```python
+        >>> from transformers import SpeechEncoderDecoderModel, Speech2Text2Processor
+        >>> from datasets import load_dataset
+        >>> import torch
 
-            >>> processor = Speech2Text2Processor.from_pretrained('facebook/s2t-wav2vec2-large-en-de')
-            >>> model = SpeechEncoderDecoderModel.from_pretrained('facebook/s2t-wav2vec2-large-en-de')
+        >>> processor = Speech2Text2Processor.from_pretrained('facebook/s2t-wav2vec2-large-en-de')
+        >>> model = SpeechEncoderDecoderModel.from_pretrained('facebook/s2t-wav2vec2-large-en-de')
 
-            >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 
-            >>> input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values
-            >>> decoder_input_ids = torch.tensor([[model.config.decoder.decoder_start_token_id]])
-            >>> outputs = model(input_values=input_values, decoder_input_ids=decoder_input_ids)
+        >>> input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values
+        >>> decoder_input_ids = torch.tensor([[model.config.decoder.decoder_start_token_id]])
+        >>> outputs = model(input_values=input_values, decoder_input_ids=decoder_input_ids)
 
-            >>> # inference (generation)
-            >>> generated = model.generate(input_values)
-            >>> translation = processor.batch_decode(generated)
-
-        """
+        >>> # inference (generation)
+        >>> generated = model.generate(input_values)
+        >>> translation = processor.batch_decode(generated)
+        ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
diff --git a/src/transformers/models/speech_to_text/configuration_speech_to_text.py b/src/transformers/models/speech_to_text/configuration_speech_to_text.py
index 75fbca0f90..4a6b165b4d 100644
--- a/src/transformers/models/speech_to_text/configuration_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/configuration_speech_to_text.py
@@ -28,86 +28,87 @@ SPEECH_TO_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class Speech2TextConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.Speech2TextModel`. It is used
+    This is the configuration class to store the configuration of a [`Speech2TextModel`]. It is used
     to instantiate an Speech2Text model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the Speech2Text
-    `facebook/s2t-small-librispeech-asr <https://huggingface.co/facebook/s2t-small-librispeech-asr>`__ architecture.
+    [facebook/s2t-small-librispeech-asr](https://huggingface.co/facebook/s2t-small-librispeech-asr) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 50265):
             Vocabulary size of the Speech2Text model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.Speech2TextModel`
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            the `inputs_ids` passed when calling [`Speech2TextModel`]
+        d_model (`int`, *optional*, defaults to 1024):
             Dimensionality of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 12):
+        encoder_layers (`int`, *optional*, defaults to 12):
             Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+        decoder_layers (`int`, *optional*, defaults to 12):
             Number of decoder layers.
-        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for classifier.
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the
+            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the
+            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
-        max_source_positions (:obj:`int`, `optional`, defaults to 6000):
+        max_source_positions (`int`, *optional*, defaults to 6000):
             The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
-        max_target_positions (:obj:`int`, `optional`, defaults to 1024):
+        max_target_positions (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        num_conv_layers (:obj:`int`, `optional`, defaults to 2):
+        num_conv_layers (`int`, *optional*, defaults to 2):
             Number of 1D convolutional layers in the conv module.
-        conv_kernel_sizes (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 5)`):
+        conv_kernel_sizes (`Tuple[int]`, *optional*, defaults to `(5, 5)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the conv module. The length
-            of :obj:`conv_kernel_sizes` has to match :obj:`num_conv_layers`.
-        conv_channels (:obj:`int`, `optional`, defaults to 1024):
+            of `conv_kernel_sizes` has to match `num_conv_layers`.
+        conv_channels (`int`, *optional*, defaults to 1024):
             An integer defining the number of output channels of each convolution layers except the final one in the
             conv module.
-        input_feat_per_channel (:obj:`int`, `optional`, defaults to 80):
+        input_feat_per_channel (`int`, *optional*, defaults to 80):
             An integer specifying the size of feature vector. This is also the dimensions of log-mel filter-bank
             features.
-        input_channels (:obj:`int`, `optional`, defaults to 1):
+        input_channels (`int`, *optional*, defaults to 1):
             An integer specifying number of input channels of the input feature vector.
 
-    Example::
+    Example:
 
-        >>> from transformers import Speech2TextModel, Speech2TextConfig
+    ```python
+    >>> from transformers import Speech2TextModel, Speech2TextConfig
 
-        >>> # Initializing a Speech2Text s2t_transformer_s style configuration
-        >>> configuration = Speech2TextConfig()
+    >>> # Initializing a Speech2Text s2t_transformer_s style configuration
+    >>> configuration = Speech2TextConfig()
 
-        >>> # Initializing a model from the s2t_transformer_s style configuration
-        >>> model = Speech2TextModel(configuration)
+    >>> # Initializing a model from the s2t_transformer_s style configuration
+    >>> model = Speech2TextModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "speech_to_text"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
index fd82de9399..8b924cde7b 100644
--- a/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/feature_extraction_speech_to_text.py
@@ -35,26 +35,26 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
     r"""
     Constructs a Speech2Text feature extractor.
 
-    This feature extractor inherits from :class:`~transformers.Speech2TextFeatureExtractor` which contains most of the
+    This feature extractor inherits from [`SequenceFeatureExtractor`] which contains most of the
     main methods. Users should refer to this superclass for more information regarding those methods.
 
     This class extracts mel-filter bank features from raw speech using TorchAudio and applies utterance-level cepstral
     mean and variance normalization to the extracted features.
 
     Args:
-        feature_size (:obj:`int`, defaults to 80):
+        feature_size (`int`, defaults to 80):
             The feature dimension of the extracted features.
-        sampling_rate (:obj:`int`, defaults to 16000):
+        sampling_rate (`int`, defaults to 16000):
             The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz).
-        num_mel_bins (:obj:`int`, defaults to 80):
+        num_mel_bins (`int`, defaults to 80):
             Number of Mel-frequency bins.
-        padding_value (:obj:`float`, defaults to 0.0):
+        padding_value (`float`, defaults to 0.0):
             The value that is used to fill the padding vectors.
-        do_ceptral_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_ceptral_normalize (`bool`, *optional*, defaults to `True`):
             Whether or not to apply utterance-level cepstral mean and variance normalization to extracted features.
-        normalize_means (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        normalize_means (`bool`, *optional*, defaults to `True`):
             Whether or not to zero-mean normalize the extracted features.
-        normalize_vars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        normalize_vars (`bool`, *optional*, defaults to `True`):
             Whether or not to unit-variance normalize the extracted features.
     """
 
@@ -140,49 +140,51 @@ class Speech2TextFeatureExtractor(SequenceFeatureExtractor):
         Main method to featurize and prepare for the model one or several sequence(s). sequences.
 
         Args:
-            raw_speech (:obj:`np.ndarray`, :obj:`List[float]`, :obj:`List[np.ndarray]`, :obj:`List[List[float]]`):
+            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                 The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                 values, a list of numpy arrays or a list of list of float values.
-            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
                 index) among:
 
-                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                   single sequence if provided).
-                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                   maximum acceptable input length for the model if that argument is not provided.
-                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                   different lengths).
-            max_length (:obj:`int`, `optional`):
+            max_length (`int`, *optional*):
                 Maximum length of the returned list and optionally padding length (see above).
-            truncation (:obj:`bool`):
-                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
-            pad_to_multiple_of (:obj:`int`, `optional`):
+            truncation (`bool`):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value.
 
                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                 >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
-            return_attention_mask (:obj:`bool`, `optional`):
+            return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
                 to the specific feature_extractor's default.
 
-                `What are attention masks? <../glossary.html#attention-mask>`__
+                [What are attention masks?](../glossary#attention-mask)
 
-                .. note::
+                <Tip>
 
-                    For Speech2TextTransoformer models, :obj:`attention_mask` should alwys be passed for batched
-                    inference, to avoid subtle bugs.
+                For Speech2TextTransformer models, `attention_mask` should always be passed for batched
+                inference, to avoid subtle bugs.
 
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+                </Tip>
+
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
-            sampling_rate (:obj:`int`, `optional`):
-                The sampling rate at which the :obj:`raw_speech` input was sampled. It is strongly recommended to pass
-                :obj:`sampling_rate` at the forward call to prevent silent errors.
-            padding_value (:obj:`float`, defaults to 0.0):
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
+                `sampling_rate` at the forward call to prevent silent errors.
+            padding_value (`float`, defaults to 0.0):
                 The value that is used to fill the padding values / vectors.
         """
 
diff --git a/src/transformers/models/speech_to_text/processing_speech_to_text.py b/src/transformers/models/speech_to_text/processing_speech_to_text.py
index 4f46217562..2db1546f89 100644
--- a/src/transformers/models/speech_to_text/processing_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/processing_speech_to_text.py
@@ -26,17 +26,17 @@ class Speech2TextProcessor:
     Constructs a Speech2Text processor which wraps a Speech2Text feature extractor and a Speech2Text tokenizer into a
     single processor.
 
-    :class:`~transformers.Speech2TextProcessor` offers all the functionalities of
-    :class:`~transformers.Speech2TextFeatureExtractor` and :class:`~transformers.Speech2TextTokenizer`. See the
-    :meth:`~transformers.Speech2TextProcessor.__call__` and :meth:`~transformers.Speech2TextProcessor.decode` for more
+    [`Speech2TextProcessor`] offers all the functionalities of
+    [`Speech2TextFeatureExtractor`] and [`Speech2TextTokenizer`]. See the
+    [`~Speech2TextProcessor.__call__`] and [`~Speech2TextProcessor.decode`] for more
     information.
 
     Args:
-        feature_extractor (:obj:`Speech2TextFeatureExtractor`):
-            An instance of :class:`~transformers.Speech2TextFeatureExtractor`. The feature extractor is a required
+        feature_extractor (`Speech2TextFeatureExtractor`):
+            An instance of [`Speech2TextFeatureExtractor`]. The feature extractor is a required
             input.
-        tokenizer (:obj:`Speech2TextTokenizer`):
-            An instance of :class:`~transformers.Speech2TextTokenizer`. The tokenizer is a required input.
+        tokenizer (`Speech2TextTokenizer`):
+            An instance of [`Speech2TextTokenizer`]. The tokenizer is a required input.
     """
 
     def __init__(self, feature_extractor, tokenizer):
@@ -56,17 +56,19 @@ class Speech2TextProcessor:
     def save_pretrained(self, save_directory):
         """
         Save a Speech2Text feature extractor object and Speech2Text tokenizer object to the directory
-        ``save_directory``, so that it can be re-loaded using the
-        :func:`~transformers.Speech2TextProcessor.from_pretrained` class method.
+        `save_directory`, so that it can be re-loaded using the
+        [`~Speech2TextProcessor.from_pretrained`] class method.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` and
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
-            docstrings of the methods above for more information.
+        This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
+        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
+        docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            save_directory (:obj:`str` or :obj:`os.PathLike`):
+            save_directory (`str` or `os.PathLike`):
                 Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                 be created if it does not exist).
         """
@@ -77,30 +79,32 @@ class Speech2TextProcessor:
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         r"""
-        Instantiate a :class:`~transformers.Speech2TextProcessor` from a pretrained Speech2Text processor.
+        Instantiate a [`Speech2TextProcessor`] from a pretrained Speech2Text processor.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling Speech2TextFeatureExtractor's
-            :meth:`~transformers.PreTrainedFeatureExtractor.from_pretrained` and Speech2TextTokenizer's
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
-            docstrings of the methods above for more information.
+        This class method is simply calling Speech2TextFeatureExtractor's
+        [`~PreTrainedFeatureExtractor.from_pretrained`] and Speech2TextTokenizer's
+        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
+        docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 This can be either:
 
-                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a feature extractor file saved using the
-                  :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` method, e.g.,
-                  ``./my_model_directory/``.
-                - a path or url to a saved feature extractor JSON `file`, e.g.,
-                  ``./my_model_directory/preprocessor_config.json``.
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g.,
+                  `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
             **kwargs
-                Additional keyword arguments passed along to both :class:`~transformers.PreTrainedFeatureExtractor` and
-                :class:`~transformers.PreTrainedTokenizer`
+                Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
+                [`PreTrainedTokenizer`]
         """
         feature_extractor = Speech2TextFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
         tokenizer = Speech2TextTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
@@ -110,9 +114,9 @@ class Speech2TextProcessor:
     def __call__(self, *args, **kwargs):
         """
         When used in normal mode, this method forwards all its arguments to Speech2TextFeatureExtractor's
-        :meth:`~transformers.Speech2TextFeatureExtractor.__call__` and returns its output. If used in the context
-        :meth:`~transformers.Speech2TextProcessor.as_target_processor` this method forwards all its arguments to
-        Speech2TextTokenizer's :meth:`~transformers.Speech2TextTokenizer.__call__`. Please refer to the doctsring of
+        [`~Speech2TextFeatureExtractor.__call__`] and returns its output. If used in the context
+        [`~Speech2TextProcessor.as_target_processor`] this method forwards all its arguments to
+        Speech2TextTokenizer's [`~Speech2TextTokenizer.__call__`]. Please refer to the docstring of
         the above two methods for more information.
         """
         return self.current_processor(*args, **kwargs)
@@ -120,7 +124,7 @@ class Speech2TextProcessor:
     def batch_decode(self, *args, **kwargs):
         """
         This method forwards all its arguments to Speech2TextTokenizer's
-        :meth:`~transformers.PreTrainedTokenizer.batch_decode`. Please refer to the docstring of this method for more
+        [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more
         information.
         """
         return self.tokenizer.batch_decode(*args, **kwargs)
@@ -128,7 +132,7 @@ class Speech2TextProcessor:
     def decode(self, *args, **kwargs):
         """
         This method forwards all its arguments to Speech2TextTokenizer's
-        :meth:`~transformers.PreTrainedTokenizer.decode`. Please refer to the docstring of this method for more
+        [`~PreTrainedTokenizer.decode`]. Please refer to the docstring of this method for more
         information.
         """
         return self.tokenizer.decode(*args, **kwargs)
diff --git a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py
index 488fa0ef82..aeef9d224d 100644
--- a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py
@@ -56,46 +56,45 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
     """
     Construct an Speech2Text tokenizer.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains some of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods.
     Users should refer to the superclass for more information regarding such methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        spm_file (:obj:`str`):
-            Path to the `SentencePiece <https://github.com/google/sentencepiece>`__ model file
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        spm_file (`str`):
+            Path to the [SentencePiece](https://github.com/google/sentencepiece) model file
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sentence token.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sentence token.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        do_upper_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        do_upper_case (`bool`, *optional*, defaults to `False`):
            Whether or not to uppercase the output when decoding.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        do_lower_case (`bool`, *optional*, defaults to `False`):
             Whether or not to lowercase the input when tokenizing.
-        tgt_lang (:obj:`str`, `optional`):
+        tgt_lang (`str`, *optional*):
             A string representing the target language.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
         **kwargs
-            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
+            Additional keyword arguments passed along to [`PreTrainedTokenizer`]
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -203,18 +202,18 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
diff --git a/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py
index abeac09a10..15579c459d 100644
--- a/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py
+++ b/src/transformers/models/speech_to_text_2/configuration_speech_to_text_2.py
@@ -28,65 +28,65 @@ SPEECH_TO_TEXT_2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class Speech2Text2Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.Speech2Text2ForCausalLM`. It
+    This is the configuration class to store the configuration of a [`Speech2Text2ForCausalLM`]. It
     is used to instantiate an Speech2Text2 model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the Speech2Text2
-    `facebook/s2t-small-librispeech-asr <https://huggingface.co/facebook/s2t-small-librispeech-asr>`__ architecture.
+    [facebook/s2t-small-librispeech-asr](https://huggingface.co/facebook/s2t-small-librispeech-asr) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 50265):
             Vocabulary size of the Speech2Text model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.Speech2TextModel`
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            the `input_ids` passed when calling [`Speech2TextModel`]
+        d_model (`int`, *optional*, defaults to 1024):
             Dimensionality of the layers and the pooler layer.
-        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+        decoder_layers (`int`, *optional*, defaults to 12):
             Number of decoder layers.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
-            The non-linear activation function (function or string) in the pooler. If string, :obj:`"gelu"`,
-            :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for classifier.
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        decoder_layerdrop: (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
-        max_source_positions (:obj:`int`, `optional`, defaults to 6000):
+        max_source_positions (`int`, *optional*, defaults to 6000):
             The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
-        max_target_positions: (:obj:`int`, `optional`, defaults to 1024):
+        max_target_positions: (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
 
-        Example::
+    Example:
 
-            >>> from transformers import Speech2Text2ForCausalLM, Speech2Text2Config
+    ```python
+    >>> from transformers import Speech2Text2ForCausalLM, Speech2Text2Config
 
-            >>> # Initializing a Speech2Text2 s2t_transformer_s style configuration
-            >>> configuration = Speech2Text2Config()
+    >>> # Initializing a Speech2Text2 s2t_transformer_s style configuration
+    >>> configuration = Speech2Text2Config()
 
-            >>> # Initializing a model from the s2t_transformer_s style configuration
-            >>> model = Speech2Text2ForCausalLM(configuration)
+    >>> # Initializing a model from the s2t_transformer_s style configuration
+    >>> model = Speech2Text2ForCausalLM(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "speech_to_text_2"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {"num_attention_heads": "decoder_attention_heads", "hidden_size": "d_model"}
diff --git a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py
index b1e9b3692d..1d6c260793 100644
--- a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py
+++ b/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py
@@ -27,16 +27,16 @@ class Speech2Text2Processor:
     Constructs a Speech2Text2 processor which wraps a Speech2Text2 feature extractor and a Speech2Text2 tokenizer into
     a single processor.
 
-    :class:`~transformers.Speech2Text2Processor` offers all the functionalities of
-    :class:`~transformers.AutoFeatureExtractor` and :class:`~transformers.Speech2Text2Tokenizer`. See the
-    :meth:`~transformers.Speech2Text2Processor.__call__` and :meth:`~transformers.Speech2Text2Processor.decode` for
+    [`Speech2Text2Processor`] offers all the functionalities of
+    [`AutoFeatureExtractor`] and [`Speech2Text2Tokenizer`]. See the
+    [`~Speech2Text2Processor.__call__`] and [`~Speech2Text2Processor.decode`] for
     more information.
 
     Args:
-        feature_extractor (:obj:`AutoFeatureExtractor`):
-            An instance of :class:`~transformers.AutoFeatureExtractor`. The feature extractor is a required input.
-        tokenizer (:obj:`Speech2Text2Tokenizer`):
-            An instance of :class:`~transformers.Speech2Text2Tokenizer`. The tokenizer is a required input.
+        feature_extractor (`AutoFeatureExtractor`):
+            An instance of [`AutoFeatureExtractor`]. The feature extractor is a required input.
+        tokenizer (`Speech2Text2Tokenizer`):
+            An instance of [`Speech2Text2Tokenizer`]. The tokenizer is a required input.
     """
 
     def __init__(self, feature_extractor, tokenizer):
@@ -56,17 +56,19 @@ class Speech2Text2Processor:
     def save_pretrained(self, save_directory):
         """
         Save a Speech2Text2 feature extractor object and Speech2Text2 tokenizer object to the directory
-        ``save_directory``, so that it can be re-loaded using the
-        :func:`~transformers.Speech2Text2Processor.from_pretrained` class method.
+        `save_directory`, so that it can be re-loaded using the
+        [`~Speech2Text2Processor.from_pretrained`] class method.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` and
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
-            docstrings of the methods above for more information.
+        This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
+        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
+        docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            save_directory (:obj:`str` or :obj:`os.PathLike`):
+            save_directory (`str` or `os.PathLike`):
                 Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                 be created if it does not exist).
         """
@@ -77,30 +79,32 @@ class Speech2Text2Processor:
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         r"""
-        Instantiate a :class:`~transformers.Speech2Text2Processor` from a pretrained Speech2Text2 processor.
+        Instantiate a [`Speech2Text2Processor`] from a pretrained Speech2Text2 processor.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling AutoFeatureExtractor's
-            :meth:`~transformers.PreTrainedFeatureExtractor.from_pretrained` and Speech2Text2Tokenizer's
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
-            docstrings of the methods above for more information.
+        This class method is simply calling AutoFeatureExtractor's
+        [`~PreTrainedFeatureExtractor.from_pretrained`] and Speech2Text2Tokenizer's
+        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
+        docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 This can be either:
 
-                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a feature extractor file saved using the
-                  :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` method, e.g.,
-                  ``./my_model_directory/``.
-                - a path or url to a saved feature extractor JSON `file`, e.g.,
-                  ``./my_model_directory/preprocessor_config.json``.
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g.,
+                  `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
             **kwargs
-                Additional keyword arguments passed along to both :class:`~transformers.PreTrainedFeatureExtractor` and
-                :class:`~transformers.PreTrainedTokenizer`
+                Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
+                [`PreTrainedTokenizer`]
         """
         feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
         tokenizer = Speech2Text2Tokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
@@ -110,9 +114,9 @@ class Speech2Text2Processor:
     def __call__(self, *args, **kwargs):
         """
         When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
-        :meth:`~transformers.AutoFeatureExtractor.__call__` and returns its output. If used in the context
-        :meth:`~transformers.Speech2Text2Processor.as_target_processor` this method forwards all its arguments to
-        Speech2Text2Tokenizer's :meth:`~transformers.Speech2Text2Tokenizer.__call__`. Please refer to the doctsring of
+        [`~AutoFeatureExtractor.__call__`] and returns its output. If used in the context
+        [`~Speech2Text2Processor.as_target_processor`] this method forwards all its arguments to
+        Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the docstring of
         the above two methods for more information.
         """
         return self.current_processor(*args, **kwargs)
@@ -120,7 +124,7 @@ class Speech2Text2Processor:
     def batch_decode(self, *args, **kwargs):
         """
         This method forwards all its arguments to Speech2Text2Tokenizer's
-        :meth:`~transformers.PreTrainedTokenizer.batch_decode`. Please refer to the docstring of this method for more
+        [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more
         information.
         """
         return self.tokenizer.batch_decode(*args, **kwargs)
@@ -128,7 +132,7 @@ class Speech2Text2Processor:
     def decode(self, *args, **kwargs):
         """
         This method forwards all its arguments to Speech2Text2Tokenizer's
-        :meth:`~transformers.PreTrainedTokenizer.decode`. Please refer to the docstring of this method for more
+        [`~PreTrainedTokenizer.decode`]. Please refer to the docstring of this method for more
         information.
         """
         return self.tokenizer.decode(*args, **kwargs)
diff --git a/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py
index fb804917d4..fc27a53a76 100644
--- a/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py
+++ b/src/transformers/models/speech_to_text_2/tokenization_speech_to_text_2.py
@@ -68,24 +68,24 @@ class Speech2Text2Tokenizer(PreTrainedTokenizer):
     """
     Constructs a Speech2Text2Tokenizer.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains some of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods.
     Users should refer to the superclass for more information regarding such methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sentence token.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sentence token.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
 
         **kwargs
-            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
+            Additional keyword arguments passed along to [`PreTrainedTokenizer`]
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/src/transformers/models/splinter/configuration_splinter.py b/src/transformers/models/splinter/configuration_splinter.py
index 986e436fe7..f7f5ca0a0e 100644
--- a/src/transformers/models/splinter/configuration_splinter.py
+++ b/src/transformers/models/splinter/configuration_splinter.py
@@ -31,62 +31,62 @@ SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class SplinterConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.SplinterModel`. It is used to
+    This is the configuration class to store the configuration of a [`SplinterModel`]. It is used to
     instantiate an Splinter model according to the specified arguments, defining the model architecture. Instantiating
-    a configuration with the defaults will yield a similar configuration to that of the Splinter `tau/splinter-base
-    <https://huggingface.co/tau/splinter-base>`__ architecture.
+    a configuration with the defaults will yield a similar configuration to that of the Splinter [tau/splinter-base](https://huggingface.co/tau/splinter-base) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the Splinter model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.SplinterModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            the `input_ids` passed when calling [`SplinterModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimension of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.SplinterModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`SplinterModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if ``config.is_decoder=True``.
-        question_token_id (:obj:`int`, `optional`, defaults to 104):
-            The id of the ``[QUESTION]`` token.
+            relevant if `config.is_decoder=True`.
+        question_token_id (`int`, *optional*, defaults to 104):
+            The id of the `[QUESTION]` token.
 
-        Example::
+    Example:
 
-            >>> from transformers import SplinterModel, SplinterConfig
+    ```python
+    >>> from transformers import SplinterModel, SplinterConfig
 
-            >>> # Initializing a Splinter tau/splinter-base style configuration
-            >>> configuration = SplinterConfig()
+    >>> # Initializing a Splinter tau/splinter-base style configuration
+    >>> configuration = SplinterConfig()
 
-            >>> # Initializing a model from the tau/splinter-base style configuration
-            >>> model = SplinterModel(configuration)
+    >>> # Initializing a model from the tau/splinter-base style configuration
+    >>> model = SplinterModel(configuration)
 
-            >>> # Accessing the model configuration
-            >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "splinter"
 
     def __init__(
diff --git a/src/transformers/models/splinter/tokenization_splinter.py b/src/transformers/models/splinter/tokenization_splinter.py
index b06a9e4184..f20b046820 100644
--- a/src/transformers/models/splinter/tokenization_splinter.py
+++ b/src/transformers/models/splinter/tokenization_splinter.py
@@ -76,44 +76,43 @@ class SplinterTokenizer(PreTrainedTokenizer):
     r"""
     Construct a Splinter tokenizer. Based on WordPiece.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
             Whether or not to do basic tokenization before WordPiece.
-        never_split (:obj:`Iterable`, `optional`):
+        never_split (`Iterable`, *optional*):
             Collection of tokens which will never be split during tokenization. Only has an effect when
-            :obj:`do_basic_tokenize=True`
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+            `do_basic_tokenize=True`
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        question_token (:obj:`str`, `optional`, defaults to :obj:`"[QUESTION]"`):
+        question_token (`str`, *optional*, defaults to `"[QUESTION]"`):
             The token used for constructing question representations.
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
             Whether or not to tokenize Chinese characters.
 
-            This should likely be deactivated for Japanese (see this `issue
-            <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+            This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
+            value for `lowercase` (as in the original BERT).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -172,7 +171,7 @@ class SplinterTokenizer(PreTrainedTokenizer):
     @property
     def question_token_id(self):
         """
-        :obj:`Optional[int]`: Id of the question token in the vocabulary, used to condition the answer on a question
+        `Optional[int]`: Id of the question token in the vocabulary, used to condition the answer on a question
         representation.
         """
         return self.convert_tokens_to_ids(self.question_token)
@@ -222,17 +221,17 @@ class SplinterTokenizer(PreTrainedTokenizer):
         Build model inputs from a pair of sequence for question answering tasks by concatenating and adding special
         tokens. A Splinter sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences for question answering: ``[CLS] question_tokens [QUESTION] . [SEP] context_tokens [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences for question answering: `[CLS] question_tokens [QUESTION] . [SEP] context_tokens [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 The question token IDs if pad_on_right, else context tokens IDs
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 The context token IDs if pad_on_right, else question token IDs
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -252,18 +251,18 @@ class SplinterTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -279,17 +278,16 @@ class SplinterTokenizer(PreTrainedTokenizer):
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
         """
-        Create the token type IDs corresponding to the sequences passed. `What are token type IDs?
-        <../glossary.html#token-type-ids>`__
+        Create the token type IDs corresponding to the sequences passed. [What are token type IDs?](../glossary#token-type-ids)
 
         Should be overridden in a subclass if the model has a special way of building those.
 
         Args:
-            token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
-            token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.
+            token_ids_0 (`List[int]`): The first tokenized sequence.
+            token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
 
         Returns:
-            :obj:`List[int]`: The token type ids.
+            `List[int]`: The token type ids.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -330,19 +328,18 @@ class BasicTokenizer(object):
     Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
 
     Args:
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        never_split (:obj:`Iterable`, `optional`):
+        never_split (`Iterable`, *optional*):
             Collection of tokens which will never be split during tokenization. Only has an effect when
-            :obj:`do_basic_tokenize=True`
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
             Whether or not to tokenize Chinese characters.
 
-            This should likely be deactivated for Japanese (see this `issue
-            <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+            This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
+            value for `lowercase` (as in the original BERT).
     """
 
     def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
@@ -359,9 +356,9 @@ class BasicTokenizer(object):
         WordPieceTokenizer.
 
         Args:
-            **never_split**: (`optional`) list of str
+            **never_split**: (*optional*) list of str
                 Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                :func:`PreTrainedTokenizer.tokenize`) List of token not to split.
+                [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
         """
         # union() returns a new set by concatenating the two sets.
         never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
@@ -487,11 +484,11 @@ class WordpieceTokenizer(object):
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
           text: A single token or whitespace separated tokens. This should have
-            already been passed through `BasicTokenizer`.
+            already been passed through *BasicTokenizer*.
 
         Returns:
           A list of wordpiece tokens.
diff --git a/src/transformers/models/splinter/tokenization_splinter_fast.py b/src/transformers/models/splinter/tokenization_splinter_fast.py
index 04bdab4204..dd50534234 100644
--- a/src/transformers/models/splinter/tokenization_splinter_fast.py
+++ b/src/transformers/models/splinter/tokenization_splinter_fast.py
@@ -54,43 +54,43 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class SplinterTokenizerFast(PreTrainedTokenizerFast):
     r"""
-    Construct a "fast" Splinter tokenizer (backed by HuggingFace's `tokenizers` library). Based on WordPiece.
+    Construct a "fast" Splinter tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        question_token (:obj:`str`, `optional`, defaults to :obj:`"[QUESTION]"`):
+        question_token (`str`, *optional*, defaults to `"[QUESTION]"`):
             The token used for constructing question representations.
-        clean_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        clean_text (`bool`, *optional*, defaults to `True`):
             Whether or not to clean the text before tokenization by removing any control characters and replacing all
             whitespaces by the classic one.
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see `this
-            issue <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
-        wordpieces_prefix: (:obj:`str`, `optional`, defaults to :obj:`"##"`):
+            value for `lowercase` (as in the original BERT).
+        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
             The prefix for subwords.
     """
 
@@ -145,7 +145,7 @@ class SplinterTokenizerFast(PreTrainedTokenizerFast):
     @property
     def question_token_id(self):
         """
-        :obj:`Optional[int]`: Id of the question token in the vocabulary, used to condition the answer on a question
+        `Optional[int]`: Id of the question token in the vocabulary, used to condition the answer on a question
         representation.
         """
         return self.convert_tokens_to_ids(self.question_token)
@@ -157,17 +157,17 @@ class SplinterTokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a pair of sequence for question answering tasks by concatenating and adding special
         tokens. A Splinter sequence has the following format:
 
-        - single sequence: ``[CLS] X [SEP]``
-        - pair of sequences for question answering: ``[CLS] question_tokens [QUESTION] . [SEP] context_tokens [SEP]``
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences for question answering: `[CLS] question_tokens [QUESTION] . [SEP] context_tokens [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 The question token IDs if pad_on_right, else context tokens IDs
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 The context token IDs if pad_on_right, else question token IDs
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -186,17 +186,16 @@ class SplinterTokenizerFast(PreTrainedTokenizerFast):
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
         """
-        Create the token type IDs corresponding to the sequences passed. `What are token type IDs?
-        <../glossary.html#token-type-ids>`__
+        Create the token type IDs corresponding to the sequences passed. [What are token type IDs?](../glossary#token-type-ids)
 
         Should be overridden in a subclass if the model has a special way of building those.
 
         Args:
-            token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
-            token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.
+            token_ids_0 (`List[int]`): The first tokenized sequence.
+            token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
 
         Returns:
-            :obj:`List[int]`: The token type ids.
+            `List[int]`: The token type ids.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
diff --git a/src/transformers/models/squeezebert/configuration_squeezebert.py b/src/transformers/models/squeezebert/configuration_squeezebert.py
index c3ed53e5dc..7a8930bfc1 100644
--- a/src/transformers/models/squeezebert/configuration_squeezebert.py
+++ b/src/transformers/models/squeezebert/configuration_squeezebert.py
@@ -29,72 +29,74 @@ SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class SqueezeBertConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.SqueezeBertModel`. It is used
+    This is the configuration class to store the configuration of a [`SqueezeBertModel`]. It is used
     to instantiate a SqueezeBERT model according to the specified arguments, defining the model architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the SqueezeBERT model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.SqueezeBertModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            the `input_ids` passed when calling [`SqueezeBertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
-            :class:`~transformers.TFBertModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or
+            [`TFBertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
 
-        pad_token_id (:obj:`int`, `optional`, defaults to 0):
+        pad_token_id (`int`, *optional*, defaults to 0):
             The ID of the token in the word embedding to use as padding.
-        embedding_size (:obj:`int`, `optional`, defaults to 768):
+        embedding_size (`int`, *optional*, defaults to 768):
             The dimension of the word embedding vectors.
 
-        q_groups (:obj:`int`, `optional`, defaults to 4):
+        q_groups (`int`, *optional*, defaults to 4):
             The number of groups in Q layer.
-        k_groups (:obj:`int`, `optional`, defaults to 4):
+        k_groups (`int`, *optional*, defaults to 4):
             The number of groups in K layer.
-        v_groups (:obj:`int`, `optional`, defaults to 4):
+        v_groups (`int`, *optional*, defaults to 4):
             The number of groups in V layer.
-        post_attention_groups (:obj:`int`, `optional`, defaults to 1):
+        post_attention_groups (`int`, *optional*, defaults to 1):
             The number of groups in the first feed forward network layer.
-        intermediate_groups (:obj:`int`, `optional`, defaults to 4):
+        intermediate_groups (`int`, *optional*, defaults to 4):
             The number of groups in the second feed forward network layer.
-        output_groups (:obj:`int`, `optional`, defaults to 4):
+        output_groups (`int`, *optional*, defaults to 4):
             The number of groups in the third feed forward network layer.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import SqueezeBertModel, SqueezeBertConfig
+    ```python
+    >>> from transformers import SqueezeBertModel, SqueezeBertConfig
 
-        >>> # Initializing a SqueezeBERT configuration
-        >>> configuration = SqueezeBertConfig()
+    >>> # Initializing a SqueezeBERT configuration
+    >>> configuration = SqueezeBertConfig()
 
-        >>> # Initializing a model from the configuration above
-        >>> model = SqueezeBertModel(configuration)
+    >>> # Initializing a model from the configuration above
+    >>> model = SqueezeBertModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
 
     Attributes: pretrained_config_archive_map (Dict[str, str]): A dictionary containing all the available pre-trained
     checkpoints.
diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert.py b/src/transformers/models/squeezebert/tokenization_squeezebert.py
index d73bb732d6..2b89f724c0 100644
--- a/src/transformers/models/squeezebert/tokenization_squeezebert.py
+++ b/src/transformers/models/squeezebert/tokenization_squeezebert.py
@@ -48,10 +48,10 @@ class SqueezeBertTokenizer(BertTokenizer):
     r"""
     Constructs a SqueezeBert tokenizer.
 
-    :class:`~transformers.SqueezeBertTokenizer is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    [`SqueezeBertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
     tokenization: punctuation splitting + wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py b/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py
index d6de6e63f8..ae1afb282e 100644
--- a/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py
+++ b/src/transformers/models/squeezebert/tokenization_squeezebert_fast.py
@@ -52,12 +52,12 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class SqueezeBertTokenizerFast(BertTokenizerFast):
     r"""
-    Constructs a "Fast" SqueezeBert tokenizer (backed by HuggingFace's `tokenizers` library).
+    Constructs a "Fast" SqueezeBert tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.SqueezeBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+    [`SqueezeBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs
     end-to-end tokenization: punctuation splitting + wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
 
diff --git a/src/transformers/models/t5/configuration_t5.py b/src/transformers/models/t5/configuration_t5.py
index bb16a5fb0f..c4e386fd35 100644
--- a/src/transformers/models/t5/configuration_t5.py
+++ b/src/transformers/models/t5/configuration_t5.py
@@ -37,45 +37,44 @@ T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class T5Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.T5Model` or a
-    :class:`~transformers.TFT5Model`. It is used to instantiate a T5 model according to the specified arguments,
+    This is the configuration class to store the configuration of a [`T5Model`] or a
+    [`TFT5Model`]. It is used to instantiate a T5 model according to the specified arguments,
     defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
-    to that of the T5 `t5-small <https://huggingface.co/t5-small>`__ architecture.
+    to that of the T5 [t5-small](https://huggingface.co/t5-small) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Arguments:
-        vocab_size (:obj:`int`, `optional`, defaults to 32128):
+        vocab_size (`int`, *optional*, defaults to 32128):
             Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.T5Model` or :class:`~transformers.TFT5Model`.
-        d_model (:obj:`int`, `optional`, defaults to 512):
+            `input_ids` passed when calling [`T5Model`] or [`TFT5Model`].
+        d_model (`int`, *optional*, defaults to 512):
             Size of the encoder layers and the pooler layer.
-        d_kv (:obj:`int`, `optional`, defaults to 64):
-            Size of the key, query, value projections per attention head. :obj:`d_kv` has to be equal to :obj:`d_model
-            // num_heads`.
-        d_ff (:obj:`int`, `optional`, defaults to 2048):
-            Size of the intermediate feed forward layer in each :obj:`T5Block`.
-        num_layers (:obj:`int`, `optional`, defaults to 6):
+        d_kv (`int`, *optional*, defaults to 64):
+            Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model // num_heads`.
+        d_ff (`int`, *optional*, defaults to 2048):
+            Size of the intermediate feed forward layer in each `T5Block`.
+        num_layers (`int`, *optional*, defaults to 6):
             Number of hidden layers in the Transformer encoder.
-        num_decoder_layers (:obj:`int`, `optional`):
-            Number of hidden layers in the Transformer decoder. Will use the same value as :obj:`num_layers` if not
+        num_decoder_layers (`int`, *optional*):
+            Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not
             set.
-        num_heads (:obj:`int`, `optional`, defaults to 8):
+        num_heads (`int`, *optional*, defaults to 8):
             Number of attention heads for each attention layer in the Transformer encoder.
-        relative_attention_num_buckets (:obj:`int`, `optional`, defaults to 32):
+        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
             The number of buckets to use for each attention layer.
-        dropout_rate (:obj:`float`, `optional`, defaults to 0.1):
+        dropout_rate (`float`, *optional*, defaults to 0.1):
             The ratio for all dropout layers.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-6):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
             The epsilon used by the layer normalization layers.
-        initializer_factor (:obj:`float`, `optional`, defaults to 1):
+        initializer_factor (`float`, *optional*, defaults to 1):
             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
             testing).
-        feed_forward_proj (:obj:`string`, `optional`, defaults to :obj:`"relu"`):
-            Type of feed forward layer to be used. Should be one of :obj:`"relu"` or :obj:`"gated-gelu"`. T5v1.1 uses
-            the :obj:`"gated-gelu"` feed forward projection. Original T5 uses :obj:`"relu"`.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        feed_forward_proj (`string`, *optional*, defaults to `"relu"`):
+            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. T5v1.1 uses
+            the `"gated-gelu"` feed forward projection. Original T5 uses `"relu"`.
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
     """
     model_type = "t5"
diff --git a/src/transformers/models/t5/modeling_flax_t5.py b/src/transformers/models/t5/modeling_flax_t5.py
index 8d46bb7400..8d6cae04e4 100644
--- a/src/transformers/models/t5/modeling_flax_t5.py
+++ b/src/transformers/models/t5/modeling_flax_t5.py
@@ -1054,17 +1054,18 @@ class FlaxT5PreTrainedModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import T5Tokenizer, FlaxT5ForConditionalGeneration
+        ```python
+        >>> from transformers import T5Tokenizer, FlaxT5ForConditionalGeneration
 
-            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-            >>> model = FlaxT5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = FlaxT5ForConditionalGeneration.from_pretrained('t5-small')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, return_tensors='np')
-            >>> encoder_outputs = model.encode(**inputs)
-        """
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, return_tensors='np')
+        >>> encoder_outputs = model.encode(**inputs)
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1114,24 +1115,25 @@ class FlaxT5PreTrainedModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import T5Tokenizer, FlaxT5ForConditionalGeneration
-            >>> import jax.numpy as jnp
+        ```python
+        >>> from transformers import T5Tokenizer, FlaxT5ForConditionalGeneration
+        >>> import jax.numpy as jnp
 
-            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-            >>> model = FlaxT5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = FlaxT5ForConditionalGeneration.from_pretrained('t5-small')
 
-            >>> text = "My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, return_tensors='np')
-            >>> encoder_outputs = model.encode(**inputs)
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, return_tensors='np')
+        >>> encoder_outputs = model.encode(**inputs)
 
-            >>> decoder_start_token_id = model.config.decoder_start_token_id
-            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> logits = outputs.logits
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1329,19 +1331,21 @@ append_call_sample_docstring(
 FLAX_T5_MODEL_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> from transformers import T5Tokenizer, FlaxT5Model
+    ```python
+    >>> from transformers import T5Tokenizer, FlaxT5Model
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = FlaxT5Model.from_pretrained('t5-small')
+    >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+    >>> model = FlaxT5Model.from_pretrained('t5-small')
 
-        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="np").input_ids
-        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="np").input_ids
+    >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="np").input_ids
+    >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="np").input_ids
 
-        >>> # forward pass
-        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
-        >>> last_hidden_states = outputs.last_hidden_state
+    >>> # forward pass
+    >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+    >>> last_hidden_states = outputs.last_hidden_state
+    ```
 """
 
 
@@ -1476,24 +1480,25 @@ class FlaxT5ForConditionalGeneration(FlaxT5PreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import T5Tokenizer, FlaxT5ForConditionalGeneration
-            >>> import jax.numpy as jnp
+        ```python
+        >>> from transformers import T5Tokenizer, FlaxT5ForConditionalGeneration
+        >>> import jax.numpy as jnp
 
-            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-            >>> model = FlaxT5ForConditionalGeneration.from_pretrained('t5-small')
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = FlaxT5ForConditionalGeneration.from_pretrained('t5-small')
 
-            >>> text = "summarize: My friends are cool but they eat too many carbs."
-            >>> inputs = tokenizer(text, return_tensors='np')
-            >>> encoder_outputs = model.encode(**inputs)
+        >>> text = "summarize: My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, return_tensors='np')
+        >>> encoder_outputs = model.encode(**inputs)
 
-            >>> decoder_start_token_id = model.config.decoder_start_token_id
-            >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> logits = outputs.logits
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1624,19 +1629,21 @@ class FlaxT5ForConditionalGeneration(FlaxT5PreTrainedModel):
 FLAX_T5_CONDITIONAL_GENERATION_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> from transformers import T5Tokenizer, FlaxT5ForConditionalGeneration
+    ```python
+    >>> from transformers import T5Tokenizer, FlaxT5ForConditionalGeneration
 
-        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-        >>> model = FlaxT5ForConditionalGeneration.from_pretrained('t5-small')
+    >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+    >>> model = FlaxT5ForConditionalGeneration.from_pretrained('t5-small')
 
-        >>> ARTICLE_TO_SUMMARIZE = "summarize: My friends are cool but they eat too many carbs."
-        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], return_tensors='np')
+    >>> ARTICLE_TO_SUMMARIZE = "summarize: My friends are cool but they eat too many carbs."
+    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], return_tensors='np')
 
-        >>> # Generate Summary
-        >>> summary_ids = model.generate(inputs['input_ids']).sequences
-        >>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    >>> # Generate Summary
+    >>> summary_ids = model.generate(inputs['input_ids']).sequences
+    >>> print(tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    ```
 """
 
 
diff --git a/src/transformers/models/t5/modeling_t5.py b/src/transformers/models/t5/modeling_t5.py
index 802d3fb967..3f34de5476 100644
--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -216,17 +216,19 @@ PARALLELIZE_DOCSTRING = r"""
 DEPARALLELIZE_DOCSTRING = r"""
     Moves the model to cpu from a model parallel state.
 
-    Example::
+    Example:
 
-        # On a 4 GPU machine with t5-3b:
-        model = T5ForConditionalGeneration.from_pretrained('t5-3b')
-        device_map = {0: [0, 1, 2],
+    ```python
+    # On a 4 GPU machine with t5-3b:
+    model = T5ForConditionalGeneration.from_pretrained('t5-3b')
+    device_map = {0: [0, 1, 2],
 
-                     1: [3, 4, 5, 6, 7, 8, 9],
-                     2: [10, 11, 12, 13, 14, 15, 16],
-                     3: [17, 18, 19, 20, 21, 22, 23]}
-        model.parallelize(device_map) # Splits the model across several devices
-        model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+                 1: [3, 4, 5, 6, 7, 8, 9],
+                 2: [10, 11, 12, 13, 14, 15, 16],
+                 3: [17, 18, 19, 20, 21, 22, 23]}
+    model.parallelize(device_map) # Splits the model across several devices
+    model.deparallelize() # Put the model back on cpu and cleans memory by calling torch.cuda.empty_cache()
+    ```
 """
 
 
@@ -1339,20 +1341,21 @@ class T5Model(T5PreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import T5Tokenizer, T5Model
+        ```python
+        >>> from transformers import T5Tokenizer, T5Model
 
-            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-            >>> model = T5Model.from_pretrained('t5-small')
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = T5Model.from_pretrained('t5-small')
 
-            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
-            >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
+        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
 
-            >>> # forward pass
-            >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> # forward pass
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         use_cache = use_cache if use_cache is not None else self.config.use_cache
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -1790,15 +1793,16 @@ class T5EncoderModel(T5PreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import T5Tokenizer, T5EncoderModel
-            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-            >>> model = T5EncoderModel.from_pretrained('t5-small')
-            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
-            >>> outputs = model(input_ids=input_ids)
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        ```python
+        >>> from transformers import T5Tokenizer, T5EncoderModel
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = T5EncoderModel.from_pretrained('t5-small')
+        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         encoder_outputs = self.encoder(
diff --git a/src/transformers/models/t5/modeling_tf_t5.py b/src/transformers/models/t5/modeling_tf_t5.py
index 1750da6718..36654c68ff 100644
--- a/src/transformers/models/t5/modeling_tf_t5.py
+++ b/src/transformers/models/t5/modeling_tf_t5.py
@@ -1172,21 +1172,21 @@ class TFT5Model(TFT5PreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import T5Tokenizer, TFT5Model
+        ```python
+        >>> from transformers import T5Tokenizer, TFT5Model
 
-            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-            >>> model = TFT5Model.from_pretrained('t5-small')
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = TFT5Model.from_pretrained('t5-small')
 
-            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="tf").input_ids  # Batch size 1
-            >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="tf").input_ids  # Batch size 1
+        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="tf").input_ids  # Batch size 1
+        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="tf").input_ids  # Batch size 1
 
-            >>> # forward pass
-            >>> outputs = model(input_ids, decoder_input_ids=decoder_input_ids)
-            >>> last_hidden_states = outputs.last_hidden_state
-
-        """
+        >>> # forward pass
+        >>> outputs = model(input_ids, decoder_input_ids=decoder_input_ids)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
         if head_mask is not None and decoder_head_mask is None:
             warnings.warn(_HEAD_MASK_WARNING_MSG, FutureWarning)
@@ -1627,17 +1627,17 @@ class TFT5EncoderModel(TFT5PreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import T5Tokenizer, TFT5EncoderModel
+        ```python
+        >>> from transformers import T5Tokenizer, TFT5EncoderModel
 
-            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-            >>> model = TFT5EncoderModel.from_pretrained('t5-small')
+        >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        >>> model = TFT5EncoderModel.from_pretrained('t5-small')
 
-            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="tf").input_ids  # Batch size 1
-            >>> outputs = model(input_ids)
-
-        """
+        >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="tf").input_ids  # Batch size 1
+        >>> outputs = model(input_ids)
+        ```"""
         inputs = input_processing(
             func=self.call,
             config=self.config,
diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py
index 6daf19d4c8..7b49310568 100644
--- a/src/transformers/models/t5/tokenization_t5.py
+++ b/src/transformers/models/t5/tokenization_t5.py
@@ -52,53 +52,54 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class T5Tokenizer(PreTrainedTokenizer):
     """
-    Construct a T5 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Construct a T5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        extra_ids (:obj:`int`, `optional`, defaults to 100):
+        extra_ids (`int`, *optional*, defaults to 100):
             Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
             accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
             indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
-            like in T5 preprocessing see `here
-            <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
-        additional_special_tokens (:obj:`List[str]`, `optional`):
+            like in T5 preprocessing see [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
+        additional_special_tokens (`List[str]`, *optional*):
             Additional special tokens used by the tokenizer.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
     Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -161,18 +162,18 @@ class T5Tokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
@@ -202,13 +203,13 @@ class T5Tokenizer(PreTrainedTokenizer):
         use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
         """
         eos = [self.eos_token_id]
 
@@ -223,17 +224,17 @@ class T5Tokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A sequence has the following format:
 
-        - single sequence: ``X </s>``
-        - pair of sequences: ``A </s> B </s>``
+        - single sequence: `X </s>`
+        - pair of sequences: `A </s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         token_ids_0 = self._add_eos_if_not_present(token_ids_0)
         if token_ids_1 is None:
diff --git a/src/transformers/models/t5/tokenization_t5_fast.py b/src/transformers/models/t5/tokenization_t5_fast.py
index faf8681071..88833b2daa 100644
--- a/src/transformers/models/t5/tokenization_t5_fast.py
+++ b/src/transformers/models/t5/tokenization_t5_fast.py
@@ -62,35 +62,36 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class T5TokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
-    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
+    Construct a "fast" T5 tokenizer (backed by HuggingFace's *tokenizers* library). Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        extra_ids (:obj:`int`, `optional`, defaults to 100):
+        extra_ids (`int`, *optional*, defaults to 100):
             Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
             accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
             indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
-            like in T5 preprocessing see `here
-            <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
-        additional_special_tokens (:obj:`List[str]`, `optional`):
+            like in T5 preprocessing see [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
+        additional_special_tokens (`List[str]`, *optional*):
             Additional special tokens used by the tokenizer.
     """
 
@@ -167,17 +168,17 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A sequence has the following format:
 
-        - single sequence: ``X </s>``
-        - pair of sequences: ``A </s> B </s>``
+        - single sequence: `X </s>`
+        - pair of sequences: `A </s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         token_ids_0 = token_ids_0 + [self.eos_token_id]
         if token_ids_1 is None:
@@ -194,13 +195,13 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
         use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
         """
         eos = [self.eos_token_id]
 
diff --git a/src/transformers/models/tapas/configuration_tapas.py b/src/transformers/models/tapas/configuration_tapas.py
index d59dc00f45..5c9c06dbc1 100644
--- a/src/transformers/models/tapas/configuration_tapas.py
+++ b/src/transformers/models/tapas/configuration_tapas.py
@@ -36,109 +36,110 @@ TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class TapasConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.TapasModel`. It is used to
+    This is the configuration class to store the configuration of a [`TapasModel`]. It is used to
     instantiate a TAPAS model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the TAPAS `tapas-base-finetuned-sqa`
-    architecture. Configuration objects inherit from :class:`~transformers.PreTrainedConfig` and can be used to control
-    the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    configuration with the defaults will yield a similar configuration to that of the TAPAS *tapas-base-finetuned-sqa*
+    architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control
+    the model outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Hyperparameters additional to BERT are taken from run_task_main.py and hparam_utils.py of the original
     implementation. Original implementation available at https://github.com/google-research/tapas/tree/master.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the TAPAS model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.TapasModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `input_ids` passed when calling [`TapasModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"swish"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_sizes (:obj:`List[int]`, `optional`, defaults to :obj:`[3, 256, 256, 2, 256, 256, 10]`):
-            The vocabulary sizes of the :obj:`token_type_ids` passed when calling :class:`~transformers.TapasModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_sizes (`List[int]`, *optional*, defaults to `[3, 256, 256, 2, 256, 256, 10]`):
+            The vocabulary sizes of the `token_type_ids` passed when calling [`TapasModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        positive_label_weight (:obj:`float`, `optional`, defaults to 10.0):
+        positive_label_weight (`float`, *optional*, defaults to 10.0):
             Weight for positive labels.
-        num_aggregation_labels (:obj:`int`, `optional`, defaults to 0):
+        num_aggregation_labels (`int`, *optional*, defaults to 0):
             The number of aggregation operators to predict.
-        aggregation_loss_weight (:obj:`float`, `optional`, defaults to 1.0):
+        aggregation_loss_weight (`float`, *optional*, defaults to 1.0):
             Importance weight for the aggregation loss.
-        use_answer_as_supervision (:obj:`bool`, `optional`):
+        use_answer_as_supervision (`bool`, *optional*):
             Whether to use the answer as the only supervision for aggregation examples.
-        answer_loss_importance (:obj:`float`, `optional`, defaults to 1.0):
+        answer_loss_importance (`float`, *optional*, defaults to 1.0):
             Importance weight for the regression loss.
-        use_normalized_answer_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_normalized_answer_loss (`bool`, *optional*, defaults to `False`):
             Whether to normalize the answer loss by the maximum of the predicted and expected value.
-        huber_loss_delta (:obj:`float`, `optional`):
+        huber_loss_delta (`float`, *optional*):
             Delta parameter used to calculate the regression loss.
-        temperature (:obj:`float`, `optional`, defaults to 1.0):
+        temperature (`float`, *optional*, defaults to 1.0):
             Value used to control (OR change) the skewness of cell logits probabilities.
-        aggregation_temperature (:obj:`float`, `optional`, defaults to 1.0):
+        aggregation_temperature (`float`, *optional*, defaults to 1.0):
             Scales aggregation logits to control the skewness of probabilities.
-        use_gumbel_for_cells (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_gumbel_for_cells (`bool`, *optional*, defaults to `False`):
             Whether to apply Gumbel-Softmax to cell selection.
-        use_gumbel_for_aggregation (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_gumbel_for_aggregation (`bool`, *optional*, defaults to `False`):
             Whether to apply Gumbel-Softmax to aggregation selection.
-        average_approximation_function (:obj:`string`, `optional`, defaults to :obj:`"ratio"`):
-            Method to calculate the expected average of cells in the weak supervision case. One of :obj:`"ratio"`,
-            :obj:`"first_order"` or :obj:`"second_order"`.
-        cell_selection_preference (:obj:`float`, `optional`):
+        average_approximation_function (`str`, *optional*, defaults to `"ratio"`):
+            Method to calculate the expected average of cells in the weak supervision case. One of `"ratio"`,
+            `"first_order"` or `"second_order"`.
+        cell_selection_preference (`float`, *optional*):
             Preference for cell selection in ambiguous cases. Only applicable in case of weak supervision for
             aggregation (WTQ, WikiSQL). If the total mass of the aggregation probabilities (excluding the "NONE"
             operator) is higher than this hyperparameter, then aggregation is predicted for an example.
-        answer_loss_cutoff (:obj:`float`, `optional`):
+        answer_loss_cutoff (`float`, *optional*):
             Ignore examples with answer loss larger than cutoff.
-        max_num_rows (:obj:`int`, `optional`, defaults to 64):
+        max_num_rows (`int`, *optional*, defaults to 64):
             Maximum number of rows.
-        max_num_columns (:obj:`int`, `optional`, defaults to 32):
+        max_num_columns (`int`, *optional*, defaults to 32):
             Maximum number of columns.
-        average_logits_per_cell (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        average_logits_per_cell (`bool`, *optional*, defaults to `False`):
             Whether to average logits per cell.
-        select_one_column (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        select_one_column (`bool`, *optional*, defaults to `True`):
             Whether to constrain the model to only select cells from a single column.
-        allow_empty_column_selection (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        allow_empty_column_selection (`bool`, *optional*, defaults to `False`):
             Whether to allow not to select any column.
-        init_cell_selection_weights_to_zero (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        init_cell_selection_weights_to_zero (`bool`, *optional*, defaults to `False`):
             Whether to initialize cell selection weights to 0 so that the initial probabilities are 50%.
-        reset_position_index_per_cell (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        reset_position_index_per_cell (`bool`, *optional*, defaults to `True`):
             Whether to restart position indexes at every cell (i.e. use relative position embeddings).
-        disable_per_token_loss (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        disable_per_token_loss (`bool`, *optional*, defaults to `False`):
             Whether to disable any (strong or weak) supervision on cells.
-        aggregation_labels (:obj:`Dict[int, label]`, `optional`):
+        aggregation_labels (`Dict[int, label]`, *optional*):
             The aggregation labels used to aggregate the results. For example, the WTQ models have the following
-            aggregation labels: :obj:`{0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}`
-        no_aggregation_label_index (:obj:`int`, `optional`):
+            aggregation labels: `{0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}`
+        no_aggregation_label_index (`int`, *optional*):
             If the aggregation labels are defined and one of these labels represents "No aggregation", this should be
             set to its index. For example, the WTQ models have the "NONE" aggregation label at index 0, so that value
             should be set to 0 for these models.
 
 
-    Example::
+    Example:
 
-        >>> from transformers import TapasModel, TapasConfig
-        >>> # Initializing a default (SQA) Tapas configuration
-        >>> configuration = TapasConfig()
-        >>> # Initializing a model from the configuration
-        >>> model = TapasModel(configuration)
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    ```python
+    >>> from transformers import TapasModel, TapasConfig
+    >>> # Initializing a default (SQA) Tapas configuration
+    >>> configuration = TapasConfig()
+    >>> # Initializing a model from the configuration
+    >>> model = TapasModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "tapas"
 
diff --git a/src/transformers/models/tapas/modeling_tapas.py b/src/transformers/models/tapas/modeling_tapas.py
index cca691bb76..e9b87abafc 100644
--- a/src/transformers/models/tapas/modeling_tapas.py
+++ b/src/transformers/models/tapas/modeling_tapas.py
@@ -912,26 +912,27 @@ class TapasModel(TapasPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import TapasTokenizer, TapasModel
-            >>> import pandas as pd
+        ```python
+        >>> from transformers import TapasTokenizer, TapasModel
+        >>> import pandas as pd
 
-            >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base')
-            >>> model = TapasModel.from_pretrained('google/tapas-base')
+        >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base')
+        >>> model = TapasModel.from_pretrained('google/tapas-base')
 
-            >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
-            ...         'Age': ["56", "45", "59"],
-            ...         'Number of movies': ["87", "53", "69"]
-            ... }
-            >>> table = pd.DataFrame.from_dict(data)
-            >>> queries = ["How many movies has George Clooney played in?", "How old is Brad Pitt?"]
+        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+        ...         'Age': ["56", "45", "59"],
+        ...         'Number of movies': ["87", "53", "69"]
+        ... }
+        >>> table = pd.DataFrame.from_dict(data)
+        >>> queries = ["How many movies has George Clooney played in?", "How old is Brad Pitt?"]
 
-            >>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt")
-            >>> outputs = model(**inputs)
+        >>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt")
+        >>> outputs = model(**inputs)
 
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py
index 946dbc86db..b9269bd55f 100644
--- a/src/transformers/models/tapas/modeling_tf_tapas.py
+++ b/src/transformers/models/tapas/modeling_tf_tapas.py
@@ -1004,26 +1004,27 @@ class TFTapasModel(TFTapasPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import TapasTokenizer, TapasModel
-            >>> import pandas as pd
+        ```python
+        >>> from transformers import TapasTokenizer, TFTapasModel
+        >>> import pandas as pd
 
-            >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base')
-            >>> model = TapasModel.from_pretrained('google/tapas-base')
+        >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base')
+        >>> model = TFTapasModel.from_pretrained('google/tapas-base')
 
-            >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
-            ...         'Age': ["56", "45", "59"],
-            ...         'Number of movies': ["87", "53", "69"]
-            ... }
-            >>> table = pd.DataFrame.from_dict(data)
-            >>> queries = ["How many movies has George Clooney played in?", "How old is Brad Pitt?"]
+        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+        ...         'Age': ["56", "45", "59"],
+        ...         'Number of movies': ["87", "53", "69"]
+        ... }
+        >>> table = pd.DataFrame.from_dict(data)
+        >>> queries = ["How many movies has George Clooney played in?", "How old is Brad Pitt?"]
 
-            >>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf")
-            >>> outputs = model(**inputs)
+        >>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf")
+        >>> outputs = model(**inputs)
 
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         inputs = input_processing(
             func=self.call,
             config=self.config,
diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py
index ef80857da5..92117b7031 100644
--- a/src/transformers/models/tapas/tokenization_tapas.py
+++ b/src/transformers/models/tapas/tokenization_tapas.py
@@ -89,7 +89,7 @@ PRETRAINED_INIT_CONFIGURATION = {name: {"do_lower_case": True} for name in PRETR
 
 class TapasTruncationStrategy(ExplicitEnum):
     """
-    Possible values for the ``truncation`` argument in :meth:`~transformers.TapasTokenizer.__call__`. Useful for
+    Possible values for the `truncation` argument in [`~TapasTokenizer.__call__`]. Useful for
     tab-completion in an IDE.
     """
 
@@ -146,44 +146,44 @@ def whitespace_tokenize(text):
 
 
 TAPAS_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
-            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
                 Whether or not to encode the sequences with the special tokens relative to their model.
-            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
                 Activates and controls padding. Accepts the following values:
 
-                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                   single sequence if provided).
-                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                   maximum acceptable input length for the model if that argument is not provided.
-                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                   different lengths).
-            truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.TapasTruncationStrategy`, `optional`, defaults to :obj:`False`):
+            truncation (`bool`, `str` or [`TapasTruncationStrategy`], *optional*, defaults to `False`):
                 Activates and controls truncation. Accepts the following values:
 
-                * :obj:`True` or :obj:`'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument
-                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
+                - `True` or `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument
+                  `max_length` or to the maximum acceptable input length for the model if that argument is not
                   provided. This will truncate row by row, removing rows from the table.
-                * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with
                   sequence lengths greater than the model maximum admissible input size).
-            max_length (:obj:`int`, `optional`):
+            max_length (`int`, *optional*):
                 Controls the maximum length to use by one of the truncation/padding parameters.
 
-                If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum
                 length is required by one of the truncation/padding parameters. If the model has no specific maximum
                 input length (like XLNet) truncation/padding to a maximum length will be deactivated.
-            is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                 tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                 which it will tokenize. This is useful for NER or token classification.
-            pad_to_multiple_of (:obj:`int`, `optional`):
+            pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
 """
 
 
@@ -192,11 +192,11 @@ class TapasTokenizer(PreTrainedTokenizer):
     Construct a TAPAS tokenizer. Based on WordPiece. Flattens a table and one or more related sentences to be used by
     TAPAS models.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
-    :class:`~transformers.TapasTokenizer` creates several token type ids to encode tabular structure. To be more
-    precise, it adds 7 token type ids, in the following order: :obj:`segment_ids`, :obj:`column_ids`, :obj:`row_ids`,
-    :obj:`prev_labels`, :obj:`column_ranks`, :obj:`inv_column_ranks` and :obj:`numeric_relations`:
+    [`TapasTokenizer`] creates several token type ids to encode tabular structure. To be more
+    precise, it adds 7 token type ids, in the following order: `segment_ids`, `column_ids`, `row_ids`,
+    `prev_labels`, `column_ranks`, `inv_column_ranks` and `numeric_relations`:
 
     - segment_ids: indicate whether a token belongs to the question (0) or the table (1). 0 for special tokens and
       padding.
@@ -215,56 +215,56 @@ class TapasTokenizer(PreTrainedTokenizer):
     - numeric_relations: indicate numeric relations between the question and the tokens of the table. 0 for all
       question tokens, special tokens and padding.
 
-    :class:`~transformers.TapasTokenizer` runs end-to-end tokenization on a table and associated sentences: punctuation
+    [`TapasTokenizer`] runs end-to-end tokenization on a table and associated sentences: punctuation
     splitting and wordpiece.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
             Whether or not to do basic tokenization before WordPiece.
-        never_split (:obj:`Iterable`, `optional`):
+        never_split (`Iterable`, *optional*):
             Collection of tokens which will never be split during tokenization. Only has an effect when
-            :obj:`do_basic_tokenize=True`
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"[UNK]"`):
+            `do_basic_tokenize=True`
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"[PAD]"`):
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        empty_token (:obj:`str`, `optional`, defaults to :obj:`"[EMPTY]"`):
+        empty_token (`str`, *optional*, defaults to `"[EMPTY]"`):
             The token used for empty cell values in a table. Empty cell values include "", "n/a", "nan" and "?".
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
             Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see this
-            `issue <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
-        cell_trim_length (:obj:`int`, `optional`, defaults to -1):
+            value for `lowercase` (as in the original BERT).
+        cell_trim_length (`int`, *optional*, defaults to -1):
             If > 0: Trim cells so that the length is <= this value. Also disables further cell trimming, should thus be
-            used with :obj:`truncation` set to :obj:`True`.
-        max_column_id (:obj:`int`, `optional`):
+            used with `truncation` set to `True`.
+        max_column_id (`int`, *optional*):
             Max column id to extract.
-        max_row_id (:obj:`int`, `optional`):
+        max_row_id (`int`, *optional*):
             Max row id to extract.
-        strip_column_names (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        strip_column_names (`bool`, *optional*, defaults to `False`):
             Whether to add empty strings instead of column names.
-        update_answer_coordinates (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        update_answer_coordinates (`bool`, *optional*, defaults to `False`):
             Whether to recompute the answer coordinates from the answer text.
-        min_question_length (:obj:`int`, `optional`):
+        min_question_length (`int`, *optional*):
             Minimum length of each question in terms of tokens (will be skipped otherwise).
-        max_question_length (:obj:`int`, `optional`):
+        max_question_length (`int`, *optional*):
             Maximum length of each question in terms of tokens (will be skipped otherwise).
     """
 
@@ -421,12 +421,12 @@ class TapasTokenizer(PreTrainedTokenizer):
         Creates the attention mask according to the query token IDs and a list of table values.
 
         Args:
-            query_ids (:obj:`List[int]`): list of token IDs corresponding to the ID.
-            table_values (:obj:`List[TableValue]`): lift of table values, which are named tuples containing the
+            query_ids (`List[int]`): list of token IDs corresponding to the query.
+            table_values (`List[TableValue]`): list of table values, which are named tuples containing the
                 token value, the column ID and the row ID of said token.
 
         Returns:
-            :obj:`List[int]`: List of ints containing the attention mask values.
+            `List[int]`: List of ints containing the attention mask values.
         """
         return [1] * (1 + len(query_ids) + 1 + len(table_values))
 
@@ -437,12 +437,12 @@ class TapasTokenizer(PreTrainedTokenizer):
         Creates the segment token type IDs according to the query token IDs and a list of table values.
 
         Args:
-            query_ids (:obj:`List[int]`): list of token IDs corresponding to the ID.
-            table_values (:obj:`List[TableValue]`): lift of table values, which are named tuples containing the
+            query_ids (`List[int]`): list of token IDs corresponding to the query.
+            table_values (`List[TableValue]`): list of table values, which are named tuples containing the
                 token value, the column ID and the row ID of said token.
 
         Returns:
-            :obj:`List[int]`: List of ints containing the segment token type IDs values.
+            `List[int]`: List of ints containing the segment token type IDs values.
         """
         table_ids = list(zip(*table_values))[0] if table_values else []
         return [0] * (1 + len(query_ids) + 1) + [1] * len(table_ids)
@@ -454,12 +454,12 @@ class TapasTokenizer(PreTrainedTokenizer):
         Creates the column token type IDs according to the query token IDs and a list of table values.
 
         Args:
-            query_ids (:obj:`List[int]`): list of token IDs corresponding to the ID.
-            table_values (:obj:`List[TableValue]`): lift of table values, which are named tuples containing the
+            query_ids (`List[int]`): list of token IDs corresponding to the query.
+            table_values (`List[TableValue]`): list of table values, which are named tuples containing the
                 token value, the column ID and the row ID of said token.
 
         Returns:
-            :obj:`List[int]`: List of ints containing the column token type IDs values.
+            `List[int]`: List of ints containing the column token type IDs values.
         """
         table_column_ids = list(zip(*table_values))[1] if table_values else []
         return [0] * (1 + len(query_ids) + 1) + list(table_column_ids)
@@ -471,12 +471,12 @@ class TapasTokenizer(PreTrainedTokenizer):
         Creates the row token type IDs according to the query token IDs and a list of table values.
 
         Args:
-            query_ids (:obj:`List[int]`): list of token IDs corresponding to the ID.
-            table_values (:obj:`List[TableValue]`): lift of table values, which are named tuples containing the
+            query_ids (`List[int]`): list of token IDs corresponding to the query.
+            table_values (`List[TableValue]`): list of table values, which are named tuples containing the
                 token value, the column ID and the row ID of said token.
 
         Returns:
-            :obj:`List[int]`: List of ints containing the row token type IDs values.
+            `List[int]`: List of ints containing the row token type IDs values.
         """
         table_row_ids = list(zip(*table_values))[2] if table_values else []
         return [0] * (1 + len(query_ids) + 1) + list(table_row_ids)
@@ -489,11 +489,11 @@ class TapasTokenizer(PreTrainedTokenizer):
         by concatenating and adding special tokens.
 
         Args:
-            token_ids_0 (:obj:`List[int]`): The ids of the question.
-            token_ids_1 (:obj:`List[int]`, `optional`): The ids of the flattened table.
+            token_ids_0 (`List[int]`): The ids of the question.
+            token_ids_1 (`List[int]`, *optional*): The ids of the flattened table.
 
         Returns:
-            :obj:`List[int]`: The model input with special tokens.
+            `List[int]`: The model input with special tokens.
         """
         if token_ids_1 is None:
             raise ValueError("With TAPAS, you must provide both question IDs and table IDs.")
@@ -505,18 +505,18 @@ class TapasTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of question IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 List of flattened table IDs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -563,20 +563,20 @@ class TapasTokenizer(PreTrainedTokenizer):
         Main method to tokenize and prepare for the model one or several sequence(s) related to a table.
 
         Args:
-            table (:obj:`pd.DataFrame`):
-                Table containing tabular data. Note that all cell values must be text. Use `.astype(str)` on a Pandas
+            table (`pd.DataFrame`):
+                Table containing tabular data. Note that all cell values must be text. Use *.astype(str)* on a Pandas
                 dataframe to convert it to string.
-            queries (:obj:`str` or :obj:`List[str]`):
+            queries (`str` or `List[str]`):
                 Question or batch of questions related to a table to be encoded. Note that in case of a batch, all
                 questions must refer to the **same** table.
-            answer_coordinates (:obj:`List[Tuple]` or :obj:`List[List[Tuple]]`, `optional`):
+            answer_coordinates (`List[Tuple]` or `List[List[Tuple]]`, *optional*):
                 Answer coordinates of each table-question pair in the batch. In case only a single table-question pair
                 is provided, then the answer_coordinates must be a single list of one or more tuples. Each tuple must
                 be a (row_index, column_index) pair. The first data row (not the column header row) has index 0. The
                 first column has index 0. In case a batch of table-question pairs is provided, then the
                 answer_coordinates must be a list of lists of tuples (each list corresponding to a single
                 table-question pair).
-            answer_text (:obj:`List[str]` or :obj:`List[List[str]]`, `optional`):
+            answer_text (`List[str]` or `List[List[str]]`, *optional*):
                 Answer text of each table-question pair in the batch. In case only a single table-question pair is
                 provided, then the answer_text must be a single list of one or more strings. Each string must be the
                 answer text of a corresponding answer coordinate. In case a batch of table-question pairs is provided,
@@ -675,22 +675,25 @@ class TapasTokenizer(PreTrainedTokenizer):
         """
         Prepare a table and a list of strings for the model.
 
-        .. warning::
-            This method is deprecated, ``__call__`` should be used instead.
+        <Tip warning={true}>
+
+        This method is deprecated, `__call__` should be used instead.
+
+        </Tip>
 
         Args:
-            table (:obj:`pd.DataFrame`):
-                Table containing tabular data. Note that all cell values must be text. Use `.astype(str)` on a Pandas
+            table (`pd.DataFrame`):
+                Table containing tabular data. Note that all cell values must be text. Use *.astype(str)* on a Pandas
                 dataframe to convert it to string.
-            queries (:obj:`List[str]`):
+            queries (`List[str]`):
                 Batch of questions related to a table to be encoded. Note that all questions must refer to the **same**
                 table.
-            answer_coordinates (:obj:`List[Tuple]` or :obj:`List[List[Tuple]]`, `optional`):
+            answer_coordinates (`List[Tuple]` or `List[List[Tuple]]`, *optional*):
                 Answer coordinates of each table-question pair in the batch. Each tuple must be a (row_index,
                 column_index) pair. The first data row (not the column header row) has index 0. The first column has
                 index 0. The answer_coordinates must be a list of lists of tuples (each list corresponding to a single
                 table-question pair).
-            answer_text (:obj:`List[str]` or :obj:`List[List[str]]`, `optional`):
+            answer_text (`List[str]` or `List[List[str]]`, *optional*):
                 Answer text of each table-question pair in the batch. In case a batch of table-question pairs is
                 provided, then the answer_coordinates must be a list of lists of strings (each list corresponding to a
                 single table-question pair). Each string must be the answer text of a corresponding answer coordinate.
@@ -900,13 +903,13 @@ class TapasTokenizer(PreTrainedTokenizer):
         """
         Prepare a table and a string for the model. This method does not return token type IDs, attention masks, etc.
         which are necessary for the model to work correctly. Use that method if you want to build your processing on
-        your own, otherwise refer to ``__call__``.
+        your own, otherwise refer to `__call__`.
 
         Args:
-            table (:obj:`pd.DataFrame`):
-                Table containing tabular data. Note that all cell values must be text. Use `.astype(str)` on a Pandas
+            table (`pd.DataFrame`):
+                Table containing tabular data. Note that all cell values must be text. Use *.astype(str)* on a Pandas
                 dataframe to convert it to string.
-            query (:obj:`str` or :obj:`List[str]`):
+            query (`str` or `List[str]`):
                 Question related to a table to be encoded.
         """
         encoded_inputs = self.encode_plus(
@@ -953,16 +956,16 @@ class TapasTokenizer(PreTrainedTokenizer):
         Prepare a table and a string for the model.
 
         Args:
-            table (:obj:`pd.DataFrame`):
-                Table containing tabular data. Note that all cell values must be text. Use `.astype(str)` on a Pandas
+            table (`pd.DataFrame`):
+                Table containing tabular data. Note that all cell values must be text. Use *.astype(str)* on a Pandas
                 dataframe to convert it to string.
-            query (:obj:`str` or :obj:`List[str]`):
+            query (`str` or `List[str]`):
                 Question related to a table to be encoded.
-            answer_coordinates (:obj:`List[Tuple]` or :obj:`List[List[Tuple]]`, `optional`):
+            answer_coordinates (`List[Tuple]` or `List[List[Tuple]]`, *optional*):
                 Answer coordinates of each table-question pair in the batch. The answer_coordinates must be a single
                 list of one or more tuples. Each tuple must be a (row_index, column_index) pair. The first data row
                 (not the column header row) has index 0. The first column has index 0.
-            answer_text (:obj:`List[str]` or :obj:`List[List[str]]`, `optional`):
+            answer_text (`List[str]` or `List[List[str]]`, *optional*):
                 Answer text of each table-question pair in the batch. The answer_text must be a single list of one or
                 more strings. Each string must be the answer text of a corresponding answer coordinate.
         """
@@ -1094,19 +1097,19 @@ class TapasTokenizer(PreTrainedTokenizer):
         sequences if overflowing while taking into account the special tokens.
 
         Args:
-            raw_table (:obj:`pd.DataFrame`):
+            raw_table (`pd.DataFrame`):
                 The original table before any transformation (like tokenization) was applied to it.
-            raw_query (:obj:`TextInput` or :obj:`PreTokenizedInput` or :obj:`EncodedInput`):
+            raw_query (`TextInput` or `PreTokenizedInput` or `EncodedInput`):
                 The original query before any transformation (like tokenization) was applied to it.
-            tokenized_table (:obj:`TokenizedTable`):
+            tokenized_table (`TokenizedTable`):
                 The table after tokenization.
-            query_tokens (:obj:`List[str]`):
+            query_tokens (`List[str]`):
                 The query after tokenization.
-            answer_coordinates (:obj:`List[Tuple]` or :obj:`List[List[Tuple]]`, `optional`):
+            answer_coordinates (`List[Tuple]` or `List[List[Tuple]]`, *optional*):
                 Answer coordinates of each table-question pair in the batch. The answer_coordinates must be a single
                 list of one or more tuples. Each tuple must be a (row_index, column_index) pair. The first data row
                 (not the column header row) has index 0. The first column has index 0.
-            answer_text (:obj:`List[str]` or :obj:`List[List[str]]`, `optional`):
+            answer_text (`List[str]` or `List[List[str]]`, *optional*):
                 Answer text of each table-question pair in the batch. The answer_text must be a single list of one or
                 more strings. Each string must be the answer text of a corresponding answer coordinate.
         """
@@ -1267,22 +1270,22 @@ class TapasTokenizer(PreTrainedTokenizer):
         Truncates a sequence pair in-place following the strategy.
 
         Args:
-            query_tokens (:obj:`List[str]`):
+            query_tokens (`List[str]`):
                 List of strings corresponding to the tokenized query.
-            tokenized_table (:obj:`TokenizedTable`):
+            tokenized_table (`TokenizedTable`):
                 Tokenized table
-            num_rows (:obj:`int`):
+            num_rows (`int`):
                 Total number of table rows
-            num_columns (:obj:`int`):
+            num_columns (`int`):
                 Total number of table columns
-            max_length (:obj:`int`):
+            max_length (`int`):
                 Total maximum length.
-            truncation_strategy (:obj:`str` or :class:`~transformers.TapasTruncationStrategy`):
+            truncation_strategy (`str` or [`TapasTruncationStrategy`]):
                 Truncation strategy to use. Seeing as this method should only be called when truncating, the only
-                available strategy is the :obj:`"drop_rows_to_fit"` strategy.
+                available strategy is the `"drop_rows_to_fit"` strategy.
 
         Returns:
-            :obj:`Tuple(int, int)`: tuple containing the number of rows after truncation, and the number of tokens
+            `Tuple(int, int)`: tuple containing the number of rows after truncation, and the number of tokens
             available for each table element.
         """
         if not isinstance(truncation_strategy, TapasTruncationStrategy):
@@ -1319,8 +1322,8 @@ class TapasTokenizer(PreTrainedTokenizer):
         Tokenizes column headers and cell texts of a table.
 
         Args:
-            table (:obj:`pd.Dataframe`):
-                Table. Returns: :obj:`TokenizedTable`: TokenizedTable object.
+            table (`pd.DataFrame`):
+                Table. Returns: `TokenizedTable`: TokenizedTable object.
         """
         tokenized_rows = []
         tokenized_row = []
@@ -1366,8 +1369,8 @@ class TapasTokenizer(PreTrainedTokenizer):
         sequence length of the model.
 
         Args:
-            question_tokens (:obj:`List[String]`):
-                List of question tokens. Returns: :obj:`int`: the number of tokens left for the table, given the model
+            question_tokens (`List[str]`):
+                List of question tokens. Returns: `int`: the number of tokens left for the table, given the model
                 max length.
         """
         return (max_length if max_length is not None else self.model_max_length) - self._question_encoding_cost(
@@ -1887,33 +1890,32 @@ class TapasTokenizer(PreTrainedTokenizer):
 
     def convert_logits_to_predictions(self, data, logits, logits_agg=None, cell_classification_threshold=0.5):
         """
-        Converts logits of :class:`~transformers.TapasForQuestionAnswering` to actual predicted answer coordinates and
+        Converts logits of [`TapasForQuestionAnswering`] to actual predicted answer coordinates and
         optional aggregation indices.
 
-        The original implementation, on which this function is based, can be found `here
-        <https://github.com/google-research/tapas/blob/4908213eb4df7aa988573350278b44c4dbe3f71b/tapas/experiments/prediction_utils.py#L288>`__.
+        The original implementation, on which this function is based, can be found [here](https://github.com/google-research/tapas/blob/4908213eb4df7aa988573350278b44c4dbe3f71b/tapas/experiments/prediction_utils.py#L288).
 
         Args:
-            data (:obj:`dict`):
+            data (`dict`):
                 Dictionary mapping features to actual values. Should be created using
-                :class:`~transformers.TapasTokenizer`.
-            logits (:obj:`torch.Tensor` or :obj:`tf.Tensor` of shape ``(batch_size, sequence_length)``):
+                [`TapasTokenizer`].
+            logits (`torch.Tensor` or `tf.Tensor` of shape `(batch_size, sequence_length)`):
                 Tensor containing the logits at the token level.
-            logits_agg (:obj:`torch.Tensor` or :obj:`tf.Tensor` of shape ``(batch_size, num_aggregation_labels)``, `optional`):
+            logits_agg (`torch.Tensor` or `tf.Tensor` of shape `(batch_size, num_aggregation_labels)`, *optional*):
                 Tensor containing the aggregation logits.
-            cell_classification_threshold (:obj:`float`, `optional`, defaults to 0.5):
+            cell_classification_threshold (`float`, *optional*, defaults to 0.5):
                 Threshold to be used for cell selection. All table cells for which their probability is larger than
                 this threshold will be selected.
 
         Returns:
-            :obj:`tuple` comprising various elements depending on the inputs:
+            `tuple` comprising various elements depending on the inputs:
 
-            - predicted_answer_coordinates (``List[List[[tuple]]`` of length ``batch_size``): Predicted answer
+            - predicted_answer_coordinates (`List[List[tuple]]` of length `batch_size`): Predicted answer
               coordinates as a list of lists of tuples. Each element in the list contains the predicted answer
               coordinates of a single example in the batch, as a list of tuples. Each tuple is a cell, i.e. (row index,
               column index).
-            - predicted_aggregation_indices (``List[int]``of length ``batch_size``, `optional`, returned when
-              ``logits_aggregation`` is provided): Predicted aggregation operator indices of the aggregation head.
+            - predicted_aggregation_indices (`List[int]` of length `batch_size`, *optional*, returned when
+              `logits_agg` is provided): Predicted aggregation operator indices of the aggregation head.
         """
         # converting to numpy arrays to work with PT/TF
         logits = logits.numpy()
@@ -1994,19 +1996,18 @@ class BasicTokenizer(object):
     Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
 
     Args:
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether or not to lowercase the input when tokenizing.
-        never_split (:obj:`Iterable`, `optional`):
+        never_split (`Iterable`, *optional*):
             Collection of tokens which will never be split during tokenization. Only has an effect when
-            :obj:`do_basic_tokenize=True`
-        tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
             Whether or not to tokenize Chinese characters.
 
-            This should likely be deactivated for Japanese (see this `issue
-            <https://github.com/huggingface/transformers/issues/328>`__).
-        strip_accents: (:obj:`bool`, `optional`):
+            This should likely be deactivated for Japanese (see this [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
             Whether or not to strip all accents. If this option is not specified, then it will be determined by the
-            value for :obj:`lowercase` (as in the original BERT).
+            value for `do_lower_case` (as in the original BERT).
     """
 
     def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
@@ -2023,9 +2024,9 @@ class BasicTokenizer(object):
         WordPieceTokenizer.
 
         Args:
-            **never_split**: (`optional`) list of str
+            never_split (`List[str]`, *optional*):
                 Kept for backward compatibility purposes. Now implemented directly at the base class level (see
-                :func:`PreTrainedTokenizer.tokenize`) List of token not to split.
+                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
         """
         # union() returns a new set by concatenating the two sets.
         never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
@@ -2152,14 +2153,14 @@ class WordpieceTokenizer(object):
         Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
         tokenization using the given vocabulary.
 
-        For example, :obj:`input = "unaffable"` wil return as output :obj:`["un", "##aff", "##able"]`.
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
 
         Args:
-          text: A single token or whitespace separated tokens. This should have
-            already been passed through `BasicTokenizer`.
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
 
         Returns:
-          A list of wordpiece tokens.
+            A list of wordpiece tokens.
         """
 
         output_tokens = []
diff --git a/src/transformers/models/transfo_xl/configuration_transfo_xl.py b/src/transformers/models/transfo_xl/configuration_transfo_xl.py
index 6787f0d022..47f824c6ab 100644
--- a/src/transformers/models/transfo_xl/configuration_transfo_xl.py
+++ b/src/transformers/models/transfo_xl/configuration_transfo_xl.py
@@ -28,81 +28,82 @@ TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class TransfoXLConfig(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a :class:`~transformers.TransfoXLModel` or a
-    :class:`~transformers.TFTransfoXLModel`. It is used to instantiate a Transformer-XL model according to the
+    This is the configuration class to store the configuration of a [`TransfoXLModel`] or a
+    [`TFTransfoXLModel`]. It is used to instantiate a Transformer-XL model according to the
     specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield a
-    similar configuration to that of the `Transformer XL <https://huggingface.co/transfo-xl-wt103>`__ architecture.
+    similar configuration to that of the [Transformer XL](https://huggingface.co/transfo-xl-wt103) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 267735):
+        vocab_size (`int`, *optional*, defaults to 267735):
             Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.TransfoXLModel` or
-            :class:`~transformers.TFTransfoXLModel`.
-        cutoffs (:obj:`List[int]`, `optional`, defaults to :obj:`[20000, 40000, 200000]`):
+            `inputs_ids` passed when calling [`TransfoXLModel`] or
+            [`TFTransfoXLModel`].
+        cutoffs (`List[int]`, *optional*, defaults to `[20000, 40000, 200000]`):
             Cutoffs for the adaptive softmax.
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+        d_model (`int`, *optional*, defaults to 1024):
             Dimensionality of the model's hidden states.
-        d_embed (:obj:`int`, `optional`, defaults to 1024):
+        d_embed (`int`, *optional*, defaults to 1024):
             Dimensionality of the embeddings
-        n_head (:obj:`int`, `optional`, defaults to 16):
+        n_head (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        d_head (:obj:`int`, `optional`, defaults to 64):
+        d_head (`int`, *optional*, defaults to 64):
             Dimensionality of the model's heads.
-        d_inner (:obj:`int`, `optional`, defaults to 4096):
+        d_inner (`int`, *optional*, defaults to 4096):
             Inner dimension in FF
-        div_val (:obj:`int`, `optional`, defaults to 4):
+        div_val (`int`, *optional*, defaults to 4):
             Divident value for adapative input and softmax
-        pre_lnorm (:obj:`boolean`, `optional`, defaults to :obj:`False`):
+        pre_lnorm (`boolean`, *optional*, defaults to `False`):
             Whether or not to apply LayerNorm to the input instead of the output in the blocks.
-        n_layer (:obj:`int`, `optional`, defaults to 18):
+        n_layer (`int`, *optional*, defaults to 18):
             Number of hidden layers in the Transformer encoder.
-        mem_len (:obj:`int`, `optional`, defaults to 1600):
+        mem_len (`int`, *optional*, defaults to 1600):
             Length of the retained previous heads.
-        clamp_len (:obj:`int`, `optional`, defaults to 1000):
+        clamp_len (`int`, *optional*, defaults to 1000):
             Use the same pos embeddings after clamp_len.
-        same_length (:obj:`boolean`, `optional`, defaults to :obj:`True`):
+        same_length (`boolean`, *optional*, defaults to `True`):
             Whether or not to use the same attn length for all tokens
-        proj_share_all_but_first (:obj:`boolean`, `optional`, defaults to :obj:`True`):
+        proj_share_all_but_first (`boolean`, *optional*, defaults to `True`):
             True to share all but first projs, False not to share.
-        attn_type (:obj:`int`, `optional`, defaults to 0):
+        attn_type (`int`, *optional*, defaults to 0):
             Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
-        sample_softmax (:obj:`int`, `optional`, defaults to -1):
+        sample_softmax (`int`, *optional*, defaults to -1):
             Number of samples in the sampled softmax.
-        adaptive (:obj:`boolean`, `optional`, defaults to :obj:`True`):
+        adaptive (`boolean`, *optional*, defaults to `True`):
             Whether or not to use adaptive softmax.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        dropatt (:obj:`float`, `optional`, defaults to 0):
+        dropatt (`float`, *optional*, defaults to 0):
             The dropout ratio for the attention probabilities.
-        untie_r (:obj:`boolean`, `optional`, defaults to :obj:`True`):
+        untie_r (`boolean`, *optional*, defaults to `True`):
             Whether ot not to untie relative position biases.
-        init (:obj:`str`, `optional`, defaults to :obj:`"normal"`):
+        init (`str`, *optional*, defaults to `"normal"`):
             Parameter initializer to use.
-        init_range (:obj:`float`, `optional`, defaults to 0.01):
+        init_range (`float`, *optional*, defaults to 0.01):
             Parameters initialized by U(-init_range, init_range).
-        proj_init_std (:obj:`float`, `optional`, defaults to 0.01):
+        proj_init_std (`float`, *optional*, defaults to 0.01):
             Parameters initialized by N(0, init_std)
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             Parameters initialized by N(0, init_std)
-        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
             The epsilon to use in the layer normalization layers
 
-    Examples::
+    Examples:
 
-        >>> from transformers import TransfoXLConfig, TransfoXLModel
+    ```python
+    >>> from transformers import TransfoXLConfig, TransfoXLModel
 
-        >>> # Initializing a Transformer XL configuration
-        >>> configuration = TransfoXLConfig()
+    >>> # Initializing a Transformer XL configuration
+    >>> configuration = TransfoXLConfig()
 
-        >>> # Initializing a model from the configuration
-        >>> model = TransfoXLModel(configuration)
+    >>> # Initializing a model from the configuration
+    >>> model = TransfoXLModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "transfo-xl"
     keys_to_ignore_at_inference = ["mems"]
diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py b/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py
index 1f804a278f..ca7a1b5621 100644
--- a/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py
+++ b/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py
@@ -189,18 +189,18 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
 
     def log_prob(self, hidden):
         r"""
-        Computes log probabilities for all :math:`n\_classes` From:
+        Computes log probabilities for all \\(n\_classes\\) From:
         https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.p
 
         Args:
             hidden (Tensor): a minibatch of example
 
         Returns:
-            log-probabilities of for each class :math:`c` in range :math:`0 <= c <= n\_classes`, where
-            :math:`n\_classes` is a parameter passed to ``AdaptiveLogSoftmaxWithLoss`` constructor. Shape:
+            log-probabilities of for each class \\(c\\) in range \\(0 <= c <= n\_classes\\), where
+            \\(n\_classes\\) is a parameter passed to `AdaptiveLogSoftmaxWithLoss` constructor. Shape:
 
-            - Input: :math:`(N, in\_features)`
-            - Output: :math:`(N, n\_classes)`
+            - Input: \\((N, in\_features)\\)
+            - Output: \\((N, n\_classes)\\)
         """
         if self.n_clusters == 0:
             logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
diff --git a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
index eb0f2f5a18..f5f1412148 100644
--- a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
+++ b/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
@@ -76,10 +76,12 @@ def tokenize_numbers(text_array: List[str]) -> List[str]:
     Returns:
         A list of strings with tokenized numbers.
 
-    Example::
-        >>> tokenize_numbers(["$", "5,000", "1.73", "m"])
-        ["$", "5", "@,@", "000", "1", "@.@", "73", "m"]
-    """
+    Example:
+
+    ```python
+    >>> tokenize_numbers(["$", "5,000", "1.73", "m"])
+    ["$", "5", "@,@", "000", "1", "@.@", "73", "m"]
+    ```"""
     tokenized = []
     for i in range(len(text_array)):
         reg, sub = MATCH_NUMBERS
@@ -91,7 +93,7 @@ def tokenize_numbers(text_array: List[str]) -> List[str]:
 
 def detokenize_numbers(text: str) -> str:
     """
-    Inverts the operation of `tokenize_numbers`. This is replacing ' @,@ ' and ' @.@' by ',' and '.'.
+    Inverts the operation of *tokenize_numbers*. This is replacing ' @,@ ' and ' @.@' by ',' and '.'.
 
     Args:
         text: A string where the number should be detokenized.
@@ -99,10 +101,12 @@ def detokenize_numbers(text: str) -> str:
     Returns:
         A detokenized string.
 
-    Example::
-        >>> detokenize_numbers("$ 5 @,@ 000 1 @.@ 73 m")
-        "$ 5,000 1.73 m"
-    """
+    Example:
+
+    ```python
+    >>> detokenize_numbers("$ 5 @,@ 000 1 @.@ 73 m")
+    "$ 5,000 1.73 m"
+    ```"""
     for reg, sub in DETOKENIZE_NUMBERS:
         text = re.sub(reg, sub, text)
     return text
@@ -110,41 +114,40 @@ def detokenize_numbers(text: str) -> str:
 
 class TransfoXLTokenizer(PreTrainedTokenizer):
     """
-    Construct a Transformer-XL tokenizer adapted from Vocab class in `the original code
-    <https://github.com/kimiyoung/transformer-xl>`__. The Transformer-XL tokenizer is a word-level tokenizer (no
+    Construct a Transformer-XL tokenizer adapted from Vocab class in [the original code](https://github.com/kimiyoung/transformer-xl). The Transformer-XL tokenizer is a word-level tokenizer (no
     sub-word tokenization).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        special (:obj:`List[str]`, `optional`):
+        special (`List[str]`, *optional*):
             A list of special tokens (to be treated by the original implementation of this tokenizer).
-        min_freq (:obj:`int`, `optional`, defaults to 0):
+        min_freq (`int`, *optional*, defaults to 0):
             The minimum number of times a token has to be present in order to be kept in the vocabulary (otherwise it
-            will be mapped to :obj:`unk_token`).
-        max_size (:obj:`int`, `optional`):
+            will be mapped to `unk_token`).
+        max_size (`int`, *optional*):
             The maximum size of the vocabulary. If left unset, it will default to the size of the vocabulary found
-            after excluding the tokens according to the :obj:`min_freq` rule.
-        lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            after excluding the tokens according to the `min_freq` rule.
+        lower_case (`bool`, *optional*, defaults to `False`):
             Whether or not to lowercase the input when tokenizing.
-        delimiter (:obj:`str`, `optional`):
+        delimiter (`str`, *optional*):
             The delimiter used between tokens.
-        vocab_file (:obj:`str`, `optional`):
+        vocab_file (`str`, *optional*):
             File containing the vocabulary (from the original implementation).
-        pretrained_vocab_file (:obj:`str`, `optional`):
-            File containing the vocabulary as saved with the :obj:`save_pretrained()` method.
-        never_split (:obj:`List[str]`, `optional`):
+        pretrained_vocab_file (`str`, *optional*):
+            File containing the vocabulary as saved with the `save_pretrained()` method.
+        never_split (`List[str]`, *optional*):
             List of tokens that should never be split. If no list is specified, will simply use the existing special
             tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"<eos>"`):
+        eos_token (`str`, *optional*, defaults to `"<eos>"`):
             The end of sequence token.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<formula>"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<formula>"]`):
             A list of additional special tokens (for the HuggingFace functionality).
-        language (:obj:`str`, `optional`, defaults to :obj:`"en"`):
+        language (`str`, *optional*, defaults to `"en"`):
             The language of this tokenizer (used for mose preprocessing).
     """
 
@@ -407,8 +410,8 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
 
     def moses_pipeline(self, text: str) -> List[str]:
         """
-        Does basic tokenization using :class:`sacremoses.MosesPunctNormalizer` and :class:`sacremoses.MosesTokenizer`
-        with `aggressive_dash_splits=True` (see :func:`sacremoses.tokenize.MosesTokenizer.tokenize`). Additionally,
+        Does basic tokenization using [`sacremoses.MosesPunctNormalizer`] and [`sacremoses.MosesTokenizer`]
+        with `aggressive_dash_splits=True` (see [`sacremoses.tokenize.MosesTokenizer.tokenize`]). Additionally,
         large comma-separated numbers and floating point values are split. E.g. "23,000 people are 1.80m tall" -> "23
         @,@ 000 people are 1 @.@ 80m tall"
 
@@ -418,11 +421,13 @@ class TransfoXLTokenizer(PreTrainedTokenizer):
         Returns:
             A list of tokenized string
 
-        Example::
-            >>> tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")
-            >>> tokenizer.moses_pipeline("23,000 people are 1.80 m tall")
-            ['23', '@,@', '000', 'people', 'are', '1', '@.@', '80', 'm', 'tall']
-        """
+        Example:
+
+        ```python
+        >>> tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")
+        >>> tokenizer.moses_pipeline("23,000 people are 1.80 m tall")
+        ['23', '@,@', '000', 'people', 'are', '1', '@.@', '80', 'm', 'tall']
+        ```"""
         text = self.moses_punct_norm(text)
         text = self.moses_tokenize(text)
         text = tokenize_numbers(text)
diff --git a/src/transformers/models/trocr/configuration_trocr.py b/src/transformers/models/trocr/configuration_trocr.py
index 643827847c..6b6c2d180a 100644
--- a/src/transformers/models/trocr/configuration_trocr.py
+++ b/src/transformers/models/trocr/configuration_trocr.py
@@ -28,68 +28,68 @@ TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class TrOCRConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.TrOCRForCausalLM`. It is used
+    This is the configuration class to store the configuration of a [`TrOCRForCausalLM`]. It is used
     to instantiate an TrOCR model according to the specified arguments, defining the model architecture. Instantiating
-    a configuration with the defaults will yield a similar configuration to that of the TrOCR `microsoft/trocr-base
-    <https://huggingface.co/microsoft/trocr-base>`__ architecture.
+    a configuration with the defaults will yield a similar configuration to that of the TrOCR [microsoft/trocr-base](https://huggingface.co/microsoft/trocr-base) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 50265):
             Vocabulary size of the TrOCR model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.TrOCRForCausalLM`.
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            `inputs_ids` passed when calling [`TrOCRForCausalLM`].
+        d_model (`int`, *optional*, defaults to 1024):
             Dimensionality of the layers and the pooler layer.
-        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+        decoder_layers (`int`, *optional*, defaults to 12):
             Number of decoder layers.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
-            The non-linear activation function (function or string) in the pooler. If string, :obj:`"gelu"`,
-            :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for classifier.
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
-        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        scale_embedding (`bool`, *optional*, defaults to `False`):
             Whether or not to scale the word embeddings by sqrt(d_model).
-        use_learned_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_learned_position_embeddings (`bool`, *optional*, defaults to `True`):
             Whether or not to use learned position embeddings. If not, sinusoidal position embeddings will be used.
-        layernorm_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        layernorm_embedding (`bool`, *optional*, defaults to `True`):
             Whether or not to use a layernorm after the word + position embeddings.
 
-    Example::
+    Example:
 
-        >>> from transformers import TrOCRForCausalLM, TrOCRConfig
+    ```python
+    >>> from transformers import TrOCRForCausalLM, TrOCRConfig
 
-        >>> # Initializing a TrOCR-base style configuration
-        >>> configuration = TrOCRConfig()
+    >>> # Initializing a TrOCR-base style configuration
+    >>> configuration = TrOCRConfig()
 
-        >>> # Initializing a model from the TrOCR-base style configuration
-        >>> model = TrOCRForCausalLM(configuration)
+    >>> # Initializing a model from the TrOCR-base style configuration
+    >>> model = TrOCRForCausalLM(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "trocr"
     keys_to_ignore_at_inference = ["past_key_values"]
     attribute_map = {
diff --git a/src/transformers/models/trocr/processing_trocr.py b/src/transformers/models/trocr/processing_trocr.py
index 319c3be966..24f8fc0dd7 100644
--- a/src/transformers/models/trocr/processing_trocr.py
+++ b/src/transformers/models/trocr/processing_trocr.py
@@ -28,15 +28,15 @@ class TrOCRProcessor:
     r"""
     Constructs a TrOCR processor which wraps a vision feature extractor and a TrOCR tokenizer into a single processor.
 
-    :class:`~transformers.TrOCRProcessor` offers all the functionalities of :class:`~transformers.AutoFeatureExtractor`
-    and :class:`~transformers.RobertaTokenizer`. See the :meth:`~transformers.TrOCRProcessor.__call__` and
-    :meth:`~transformers.TrOCRProcessor.decode` for more information.
+    [`TrOCRProcessor`] offers all the functionalities of [`AutoFeatureExtractor`]
+    and [`RobertaTokenizer`]. See the [`~TrOCRProcessor.__call__`] and
+    [`~TrOCRProcessor.decode`] for more information.
 
     Args:
-        feature_extractor (:class:`~transformers.AutoFeatureExtractor`):
-            An instance of :class:`~transformers.AutoFeatureExtractor`. The feature extractor is a required input.
-        tokenizer (:class:`~transformers.RobertaTokenizer`):
-            An instance of :class:`~transformers.RobertaTokenizer`. The tokenizer is a required input.
+        feature_extractor ([`AutoFeatureExtractor`]):
+            An instance of [`AutoFeatureExtractor`]. The feature extractor is a required input.
+        tokenizer ([`RobertaTokenizer`]):
+            An instance of [`RobertaTokenizer`]. The tokenizer is a required input.
     """
 
     def __init__(self, feature_extractor, tokenizer):
@@ -55,17 +55,19 @@ class TrOCRProcessor:
 
     def save_pretrained(self, save_directory):
         """
-        Save a TrOCR feature extractor object and TrOCR tokenizer object to the directory ``save_directory``, so that
-        it can be re-loaded using the :func:`~transformers.TrOCRProcessor.from_pretrained` class method.
+        Save a TrOCR feature extractor object and TrOCR tokenizer object to the directory `save_directory`, so that
+        it can be re-loaded using the [`~TrOCRProcessor.from_pretrained`] class method.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` and
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
-            docstrings of the methods above for more information.
+        This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
+        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
+        docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            save_directory (:obj:`str` or :obj:`os.PathLike`):
+            save_directory (`str` or `os.PathLike`):
                 Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                 be created if it does not exist).
         """
@@ -76,30 +78,32 @@ class TrOCRProcessor:
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         r"""
-        Instantiate a :class:`~transformers.TrOCRProcessor` from a pretrained TrOCR processor.
+        Instantiate a [`TrOCRProcessor`] from a pretrained TrOCR processor.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling AutoFeatureExtractor's
-            :meth:`~transformers.PreTrainedFeatureExtractor.from_pretrained` and TrOCRTokenizer's
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
-            docstrings of the methods above for more information.
+        This class method is simply calling AutoFeatureExtractor's
+        [`~PreTrainedFeatureExtractor.from_pretrained`] and TrOCRTokenizer's
+        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
+        docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 This can be either:
 
-                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a feature extractor file saved using the
-                  :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` method, e.g.,
-                  ``./my_model_directory/``.
-                - a path or url to a saved feature extractor JSON `file`, e.g.,
-                  ``./my_model_directory/preprocessor_config.json``.
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g.,
+                  `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
             **kwargs
-                Additional keyword arguments passed along to both :class:`~transformers.PreTrainedFeatureExtractor` and
-                :class:`~transformers.PreTrainedTokenizer`
+                Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
+                [`PreTrainedTokenizer`]
         """
         feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
         tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
@@ -109,9 +113,9 @@ class TrOCRProcessor:
     def __call__(self, *args, **kwargs):
         """
         When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
-        :meth:`~transformers.AutoFeatureExtractor.__call__` and returns its output. If used in the context
-        :meth:`~transformers.TrOCRProcessor.as_target_processor` this method forwards all its arguments to
-        TrOCRTokenizer's :meth:`~transformers.TrOCRTokenizer.__call__`. Please refer to the doctsring of the above two
+        [`~AutoFeatureExtractor.__call__`] and returns its output. If used in the context
+        [`~TrOCRProcessor.as_target_processor`] this method forwards all its arguments to
+        TrOCRTokenizer's [`~TrOCRTokenizer.__call__`]. Please refer to the docstring of the above two
         methods for more information.
         """
         return self.current_processor(*args, **kwargs)
@@ -119,14 +123,14 @@ class TrOCRProcessor:
     def batch_decode(self, *args, **kwargs):
         """
         This method forwards all its arguments to TrOCRTokenizer's
-        :meth:`~transformers.PreTrainedTokenizer.batch_decode`. Please refer to the docstring of this method for more
+        [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more
         information.
         """
         return self.tokenizer.batch_decode(*args, **kwargs)
 
     def decode(self, *args, **kwargs):
         """
-        This method forwards all its arguments to TrOCRTokenizer's :meth:`~transformers.PreTrainedTokenizer.decode`.
+        This method forwards all its arguments to TrOCRTokenizer's [`~PreTrainedTokenizer.decode`].
         Please refer to the docstring of this method for more information.
         """
         return self.tokenizer.decode(*args, **kwargs)
diff --git a/src/transformers/models/unispeech/configuration_unispeech.py b/src/transformers/models/unispeech/configuration_unispeech.py
index d328f5a6df..0d1a08d653 100644
--- a/src/transformers/models/unispeech/configuration_unispeech.py
+++ b/src/transformers/models/unispeech/configuration_unispeech.py
@@ -28,147 +28,144 @@ UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class UniSpeechConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.UniSpeechModel`. It is used
+    This is the configuration class to store the configuration of a [`UniSpeechModel`]. It is used
     to instantiate an UniSpeech model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the UniSpeech
-    `facebook/unispeech-base-960h <https://huggingface.co/facebook/unispeech-base-960h>`__ architecture.
+    [facebook/unispeech-base-960h](https://huggingface.co/facebook/unispeech-base-960h) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 32):
+        vocab_size (`int`, *optional*, defaults to 32):
             Vocabulary size of the UniSpeech model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.UniSpeechModel`. Vocabulary size of the
-            model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward
-            method of :class:`~transformers.UniSpeechModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            the `inputs_ids` passed when calling [`UniSpeechModel`]. Vocabulary size of the
+            model. Defines the different tokens that can be represented by the *inputs_ids* passed to the forward
+            method of [`UniSpeechModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        attention_dropout (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        final_dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probability for the final projection layer of :class:`UniSpeechForCTC`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`UniSpeechForCTC`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        feat_extract_norm (:obj:`str`, `optional`, defaults to :obj:`"group"`):
-            The norm to be applied to 1D convolutional layers in feature extractor. One of :obj:`"group"` for group
-            normalization of only the first 1D convolutional layer or :obj:`"layer"` for layer normalization of all 1D
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
             convolutional layers.
-        feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probability for output of the feature extractor.
-        feat_extract_activation (:obj:`str, `optional`, defaults to :obj:`"gelu"`):
+        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the 1D convolutional layers of the feature
-            extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        feat_quantizer_dropout (obj:`float`, `optional`, defaults to 0.0):
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for quantized feature extractor states.
-        conv_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 512, 512, 512)`):
+        conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
-            feature extractor. The length of `conv_dim` defines the number of 1D convolutional layers.
-        conv_stride (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 2, 2, 2, 2, 2, 2)`):
+            feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
-            of `conv_stride` defines the number of convolutional layers and has to match the the length of `conv_dim`.
-        conv_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(10, 3, 3, 3, 3, 3, 3)`):
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
-            length of `conv_kernel` defines the number of convolutional layers and has to match the the length of
-            `conv_dim`.
-        conv_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
             Whether the 1D convolutional layers have a bias.
-        num_conv_pos_embeddings (:obj:`int`, `optional`, defaults to 128):
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
             Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
             embeddings layer.
-        num_conv_pos_embedding_groups (:obj:`int`, `optional`, defaults to 16):
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
             Number of groups of 1D convolutional positional embeddings layer.
-        do_stable_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to apply `stable` layer norm architecture of the Transformer encoder. ``do_stable_layer_norm is
-            True`` corresponds to applying layer norm before the attention layer, whereas ``do_stable_layer_norm is
-            False`` corresponds to applying layer norm after the attention layer.
-        apply_spec_augment (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer.
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
             Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
-            `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
-            <https://arxiv.org/abs/1904.08779>`__.
-        mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
             procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
-            masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
-            the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
-        mask_time_length (:obj:`int`, `optional`, defaults to 10):
+            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease
+            the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
             Length of vector span along the time axis.
-        mask_time_min_masks (:obj:`int`, `optional`, defaults to 2),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the time axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_time_min_masks (`int`, *optional*, defaults to 2),:
+            The minimum number of masks of length `mask_time_length` generated along the time axis, each time
+            step, irrespectively of `mask_time_prob`. Only relevant if
             ''mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks''
-        mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
             masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
             the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
-            span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
-            overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
-            is True``.
-        mask_feature_length (:obj:`int`, `optional`, defaults to 10):
+            span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that
+            overlap may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
             Length of vector span along the feature axis.
-        mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_feature_min_masks (`int`, *optional*, defaults to 0):
+            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+            step, irrespectively of `mask_feature_prob`. Only relevant if
             ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
-        num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
+        num_codevectors_per_group (`int`, *optional*, defaults to 320):
             Number of entries in each quantization codebook (group).
-        num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
+        num_codevector_groups (`int`, *optional*, defaults to 2):
             Number of codevector groups for product codevector quantization.
-        contrastive_logits_temperature (:obj:`float`, `optional`, defaults to 0.1):
-            The temperature `kappa` in the contrastive loss.
-        feat_quantizer_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
+            The temperature *kappa* in the contrastive loss.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for the output of the feature extractor that's used by the quantizer.
-        num_negatives (:obj:`int`, `optional`, defaults to 100):
+        num_negatives (`int`, *optional*, defaults to 100):
             Number of negative samples for the contrastive loss.
-        codevector_dim (:obj:`int`, `optional`, defaults to 256):
+        codevector_dim (`int`, *optional*, defaults to 256):
             Dimensionality of the quantized feature vectors.
-        proj_codevector_dim (:obj:`int`, `optional`, defaults to 256):
+        proj_codevector_dim (`int`, *optional*, defaults to 256):
             Dimensionality of the final projection of both the quantized and the transformer features.
-        diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1):
+        diversity_loss_weight (`float`, *optional*, defaults to 0.1):
             The weight of the codebook diversity loss component.
-        ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
-            Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
-            instance of :class:`~transformers.UniSpeechForCTC`.
-        ctc_zero_infinity (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to zero infinite losses and the associated gradients of ``torch.nn.CTCLoss``. Infinite losses
+        ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`UniSpeechForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses
             mainly occur when the inputs are too short to be aligned to the targets. Only relevant when training an
-            instance of :class:`~transformers.UniSpeechForCTC`.
-        use_weighted_layer_sum (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            instance of [`UniSpeechForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
             Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
-            instance of :class:`~transformers.UniSpeechForSequenceClassification`.
-        classifier_proj_size (:obj:`int`, `optional`, defaults to 256):
+            instance of [`UniSpeechForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 256):
             Dimensionality of the projection before token mean-pooling for classification.
-        replace_prob (:obj:`float`, `optional`, defaults to 0.5):
+        replace_prob (`float`, *optional*, defaults to 0.5):
             Propability that transformer feature is replaced by quantized feature for pretraining.
 
-    Example::
+    Example:
 
-        >>> from transformers import UniSpeechModel, UniSpeechConfig
+    ```python
+    >>> from transformers import UniSpeechModel, UniSpeechConfig
 
-        >>> # Initializing a UniSpeech facebook/unispeech-base-960h style configuration
-        >>> configuration = UniSpeechConfig()
+    >>> # Initializing a UniSpeech facebook/unispeech-base-960h style configuration
+    >>> configuration = UniSpeechConfig()
 
-        >>> # Initializing a model from the facebook/unispeech-base-960h style configuration
-        >>> model = UniSpeechModel(configuration)
+    >>> # Initializing a model from the facebook/unispeech-base-960h style configuration
+    >>> model = UniSpeechModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "unispeech"
 
     def __init__(
diff --git a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py
index d9c2a00fce..120fe49547 100644
--- a/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/configuration_unispeech_sat.py
@@ -28,156 +28,153 @@ UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class UniSpeechSatConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.UniSpeechSatModel`. It is
+    This is the configuration class to store the configuration of a [`UniSpeechSatModel`]. It is
     used to instantiate an UniSpeechSat model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the UniSpeechSat
-    `facebook/unispeech_sat-base-960h <https://huggingface.co/facebook/unispeech_sat-base-960h>`__ architecture.
+    [facebook/unispeech_sat-base-960h](https://huggingface.co/facebook/unispeech_sat-base-960h) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 32):
+        vocab_size (`int`, *optional*, defaults to 32):
             Vocabulary size of the UniSpeechSat model. Defines the number of different tokens that can be represented
-            by the :obj:`inputs_ids` passed when calling :class:`~transformers.UniSpeechSatModel`. Vocabulary size of
-            the model. Defines the different tokens that can be represented by the `inputs_ids` passed to the forward
-            method of :class:`~transformers.UniSpeechSatModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            by the `inputs_ids` passed when calling [`UniSpeechSatModel`]. Vocabulary size of
+            the model. Defines the different tokens that can be represented by the *inputs_ids* passed to the forward
+            method of [`UniSpeechSatModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        attention_dropout (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        final_dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probability for the final projection layer of :class:`UniSpeechSatForCTC`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`UniSpeechSatForCTC`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        feat_extract_norm (:obj:`str`, `optional`, defaults to :obj:`"group"`):
-            The norm to be applied to 1D convolutional layers in feature extractor. One of :obj:`"group"` for group
-            normalization of only the first 1D convolutional layer or :obj:`"layer"` for layer normalization of all 1D
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
             convolutional layers.
-        feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probability for output of the feature extractor.
-        feat_extract_activation (:obj:`str, `optional`, defaults to :obj:`"gelu"`):
+        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the 1D convolutional layers of the feature
-            extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        feat_quantizer_dropout (obj:`float`, `optional`, defaults to 0.0):
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for quantized feature extractor states.
-        conv_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 512, 512, 512)`):
+        conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
-            feature extractor. The length of `conv_dim` defines the number of 1D convolutional layers.
-        conv_stride (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 2, 2, 2, 2, 2, 2)`):
+            feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
-            of `conv_stride` defines the number of convolutional layers and has to match the the length of `conv_dim`.
-        conv_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(10, 3, 3, 3, 3, 3, 3)`):
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
-            length of `conv_kernel` defines the number of convolutional layers and has to match the the length of
-            `conv_dim`.
-        conv_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
             Whether the 1D convolutional layers have a bias.
-        num_conv_pos_embeddings (:obj:`int`, `optional`, defaults to 128):
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
             Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
             embeddings layer.
-        num_conv_pos_embedding_groups (:obj:`int`, `optional`, defaults to 16):
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
             Number of groups of 1D convolutional positional embeddings layer.
-        do_stable_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to apply `stable` layer norm architecture of the Transformer encoder. ``do_stable_layer_norm is
-            True`` corresponds to applying layer norm before the attention layer, whereas ``do_stable_layer_norm is
-            False`` corresponds to applying layer norm after the attention layer.
-        apply_spec_augment (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer.
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
             Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
-            `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
-            <https://arxiv.org/abs/1904.08779>`__.
-        mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
             procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
-            masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
-            the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
-        mask_time_length (:obj:`int`, `optional`, defaults to 10):
+            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease
+            the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
             Length of vector span along the time axis.
-        mask_time_min_masks (:obj:`int`, `optional`, defaults to 2),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the time axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_time_min_masks (`int`, *optional*, defaults to 2):
+            The minimum number of masks of length `mask_time_length` generated along the time axis, each time
+            step, irrespectively of `mask_time_prob`. Only relevant if
             ''mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks''
-        mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
             masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
             the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
-            span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
-            overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
-            is True``.
-        mask_feature_length (:obj:`int`, `optional`, defaults to 10):
+            span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that
+            overlap may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
             Length of vector span along the feature axis.
-        mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_feature_min_masks (`int`, *optional*, defaults to 0):
+            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+            step, irrespectively of `mask_feature_prob`. Only relevant if
             ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
-        num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
+        num_codevectors_per_group (`int`, *optional*, defaults to 320):
             Number of entries in each quantization codebook (group).
-        num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
+        num_codevector_groups (`int`, *optional*, defaults to 2):
             Number of codevector groups for product codevector quantization.
-        contrastive_logits_temperature (:obj:`float`, `optional`, defaults to 0.1):
-            The temperature `kappa` in the contrastive loss.
-        feat_quantizer_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
+            The temperature *kappa* in the contrastive loss.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for the output of the feature extractor that's used by the quantizer.
-        num_negatives (:obj:`int`, `optional`, defaults to 100):
+        num_negatives (`int`, *optional*, defaults to 100):
             Number of negative samples for the contrastive loss.
-        codevector_dim (:obj:`int`, `optional`, defaults to 256):
+        codevector_dim (`int`, *optional*, defaults to 256):
             Dimensionality of the quantized feature vectors.
-        proj_codevector_dim (:obj:`int`, `optional`, defaults to 256):
+        proj_codevector_dim (`int`, *optional*, defaults to 256):
             Dimensionality of the final projection of both the quantized and the transformer features.
-        diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1):
+        diversity_loss_weight (`float`, *optional*, defaults to 0.1):
             The weight of the codebook diversity loss component.
-        ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
-            Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
-            instance of :class:`~transformers.UniSpeechSatForCTC`.
-        ctc_zero_infinity (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to zero infinite losses and the associated gradients of ``torch.nn.CTCLoss``. Infinite losses
+        ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`UniSpeechSatForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses
             mainly occur when the inputs are too short to be aligned to the targets. Only relevant when training an
-            instance of :class:`~transformers.UniSpeechSatForCTC`.
-        use_weighted_layer_sum (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            instance of [`UniSpeechSatForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
             Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
-            instance of :class:`~transformers.UniSpeechSatForSequenceClassification`.
-        classifier_proj_size (:obj:`int`, `optional`, defaults to 256):
+            instance of [`UniSpeechSatForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 256):
             Dimensionality of the projection before token mean-pooling for classification.
-        tdnn_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 1500)`):
-            A tuple of integers defining the number of output channels of each 1D convolutional layer in the `TDNN`
-            module of the `XVector` model. The length of `tdnn_dim` defines the number of `TDNN` layers.
-        tdnn_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 3, 3, 1, 1)`):
-            A tuple of integers defining the kernel size of each 1D convolutional layer in the `TDNN` module of the
-            `XVector` model. The length of `tdnn_kernel` has to match the length of `tdnn_dim`.
-        tdnn_dilation (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(1, 2, 3, 1, 1)`):
-            A tuple of integers defining the dilation factor of each 1D convolutional layer in `TDNN` module of the
-            `XVector` model. The length of `tdnn_dilation` has to match the length of `tdnn_dim`.
-        xvector_output_dim (:obj:`int`, `optional`, defaults to 512):
-            Dimensionality of the `XVector` embedding vectors.
+        tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+            A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
+            module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
+        tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
+            *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
+        tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+            A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
+            *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
+        xvector_output_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of the *XVector* embedding vectors.
 
-    Example::
+    Example:
 
-        >>> from transformers import UniSpeechSatModel, UniSpeechSatConfig
+    ```python
+    >>> from transformers import UniSpeechSatModel, UniSpeechSatConfig
 
-        >>> # Initializing a UniSpeechSat facebook/unispeech_sat-base-960h style configuration
-        >>> configuration = UniSpeechSatConfig()
+    >>> # Initializing a UniSpeechSat facebook/unispeech_sat-base-960h style configuration
+    >>> configuration = UniSpeechSatConfig()
 
-        >>> # Initializing a model from the facebook/unispeech_sat-base-960h style configuration
-        >>> model = UniSpeechSatModel(configuration)
+    >>> # Initializing a model from the facebook/unispeech_sat-base-960h style configuration
+    >>> model = UniSpeechSatModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "unispeech-sat"
 
     def __init__(
diff --git a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
index 28aa752ebd..bc81a65a9b 100755
--- a/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
+++ b/src/transformers/models/unispeech_sat/modeling_unispeech_sat.py
@@ -1266,49 +1266,50 @@ class UniSpeechSatForPreTraining(UniSpeechSatPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> import torch
-            >>> from transformers import UniSpeechSatFeatureExtractor, UniSpeechSatForPreTraining
-            >>> from transformers.models.unispeech_sat.modeling_unispeech_sat import _compute_mask_indices
-            >>> from datasets import load_dataset
-            >>> import soundfile as sf
+        ```python
+        >>> import torch
+        >>> from transformers import UniSpeechSatFeatureExtractor, UniSpeechSatForPreTraining
+        >>> from transformers.models.unispeech_sat.modeling_unispeech_sat import _compute_mask_indices
+        >>> from datasets import load_dataset
+        >>> import soundfile as sf
 
-            >>> feature_extractor = UniSpeechSatFeatureExtractor.from_pretrained("patrickvonplaten/unispeech_sat-base")
-            >>> model = UniSpeechSatForPreTraining.from_pretrained("patrickvonplaten/unispeech_sat-base")
+        >>> feature_extractor = UniSpeechSatFeatureExtractor.from_pretrained("patrickvonplaten/unispeech_sat-base")
+        >>> model = UniSpeechSatForPreTraining.from_pretrained("patrickvonplaten/unispeech_sat-base")
 
 
-            >>> def map_to_array(batch):
-            ...     speech, _ = sf.read(batch["file"])
-            ...     batch["speech"] = speech
-            ...     return batch
+        >>> def map_to_array(batch):
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...     return batch
 
 
-            >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
-            >>> ds = ds.map(map_to_array)
+        >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = ds.map(map_to_array)
 
-            >>> input_values = feature_extractor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
+        >>> input_values = feature_extractor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
 
-            >>> # compute masked indices
-            >>> batch_size, raw_sequence_length = input_values.shape
-            >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
-            >>> mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2)
+        >>> # compute masked indices
+        >>> batch_size, raw_sequence_length = input_values.shape
+        >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
+        >>> mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2)
 
-            >>> with torch.no_grad():
-            ...     outputs = model(input_values, mask_time_indices=mask_time_indices)
+        >>> with torch.no_grad():
+        ...     outputs = model(input_values, mask_time_indices=mask_time_indices)
 
-            >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
-            >>> cosine_sim = torch.cosine_similarity(
-            ...     outputs.projected_states, outputs.projected_quantized_states, dim=-1
-            ... )
+        >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
+        >>> cosine_sim = torch.cosine_similarity(
+        ...     outputs.projected_states, outputs.projected_quantized_states, dim=-1
+        ... )
 
-            >>> # show that cosine similarity is much higher than random
-            >>> assert cosine_sim[mask_time_indices].mean() > 0.5
+        >>> # show that cosine similarity is much higher than random
+        >>> assert cosine_sim[mask_time_indices].mean() > 0.5
 
-            >>> # for contrastive loss training model should be put into train mode
-            >>> model.train()
-            >>> loss = model(input_values, mask_time_indices=mask_time_indices).loss
-        """
+        >>> # for contrastive loss training model should be put into train mode
+        >>> model.train()
+        >>> loss = model(input_values, mask_time_indices=mask_time_indices).loss
+        ```"""
 
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
diff --git a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
index 1191a67b12..3dc0e4fdf9 100644
--- a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py
@@ -26,49 +26,50 @@ logger = logging.get_logger(__name__)
 
 class VisionEncoderDecoderConfig(PretrainedConfig):
     r"""
-    :class:`~transformers.VisionEncoderDecoderConfig` is the configuration class to store the configuration of a
-    :class:`~transformers.VisionEncoderDecoderModel`. It is used to instantiate a Vision-Encoder-Text-Decoder model
+    [`VisionEncoderDecoderConfig`] is the configuration class to store the configuration of a
+    [`VisionEncoderDecoderModel`]. It is used to instantiate a Vision-Encoder-Text-Decoder model
     according to the specified arguments, defining the encoder and decoder configs.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        kwargs (`optional`):
+        kwargs (*optional*):
             Dictionary of keyword arguments. Notably:
 
-                - **encoder** (:class:`~transformers.PretrainedConfig`, `optional`) -- An instance of a configuration
+                - **encoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration
                   object that defines the encoder config.
-                - **decoder** (:class:`~transformers.PretrainedConfig`, `optional`) -- An instance of a configuration
+                - **decoder** ([`PretrainedConfig`], *optional*) -- An instance of a configuration
                   object that defines the decoder config.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel
+    ```python
+    >>> from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel
 
-        >>> # Initializing a ViT & BERT style configuration
-        >>> config_encoder = ViTConfig()
-        >>> config_decoder = BertConfig()
+    >>> # Initializing a ViT & BERT style configuration
+    >>> config_encoder = ViTConfig()
+    >>> config_decoder = BertConfig()
 
-        >>> config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+    >>> config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
 
-        >>> # Initializing a ViTBert model from a ViT & bert-base-uncased style configurations
-        >>> model = VisionEncoderDecoderModel(config=config)
+    >>> # Initializing a ViTBert model from a ViT & bert-base-uncased style configurations
+    >>> model = VisionEncoderDecoderModel(config=config)
 
-        >>> # Accessing the model configuration
-        >>> config_encoder = model.config.encoder
-        >>> config_decoder  = model.config.decoder
-        >>> # set decoder config to causal lm
-        >>> config_decoder.is_decoder = True
-        >>> config_decoder.add_cross_attention = True
+    >>> # Accessing the model configuration
+    >>> config_encoder = model.config.encoder
+    >>> config_decoder  = model.config.decoder
+    >>> # set decoder config to causal lm
+    >>> config_decoder.is_decoder = True
+    >>> config_decoder.add_cross_attention = True
 
-        >>> # Saving the model, including its configuration
-        >>> model.save_pretrained('my-model')
+    >>> # Saving the model, including its configuration
+    >>> model.save_pretrained('my-model')
 
-        >>> # loading model and config from pretrained folder
-        >>> encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained('my-model')
-        >>> model = VisionEncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
-    """
+    >>> # loading model and config from pretrained folder
+    >>> encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained('my-model')
+    >>> model = VisionEncoderDecoderModel.from_pretrained('my-model', config=encoder_decoder_config)
+    ```"""
     model_type = "vision-encoder-decoder"
     is_composition = True
 
@@ -94,11 +95,11 @@ class VisionEncoderDecoderConfig(PretrainedConfig):
         cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
     ) -> PretrainedConfig:
         r"""
-        Instantiate a :class:`~transformers.VisionEncoderDecoderConfig` (or a derived class) from a pre-trained encoder
+        Instantiate a [`VisionEncoderDecoderConfig`] (or a derived class) from a pre-trained encoder
         model configuration and decoder model configuration.
 
         Returns:
-            :class:`VisionEncoderDecoderConfig`: An instance of a configuration object
+            [`VisionEncoderDecoderConfig`]: An instance of a configuration object
         """
         logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
         decoder_config.is_decoder = True
@@ -108,10 +109,10 @@ class VisionEncoderDecoderConfig(PretrainedConfig):
 
     def to_dict(self):
         """
-        Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PretrainedConfig`.
+        Serializes this instance to a Python dictionary. Override the default *to_dict()* from *PretrainedConfig*.
 
         Returns:
-            :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
         """
         output = copy.deepcopy(self.__dict__)
         output["encoder"] = self.encoder.to_dict()
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
index e177cd9b76..35b0629adf 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
@@ -395,24 +395,24 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import FlaxVisionEncoderDecoderModel
-            >>> from PIL import Image
-            >>> import requests
+        ```python
+        >>> from transformers import FlaxVisionEncoderDecoderModel, ViTFeatureExtractor
+        >>> from PIL import Image
+        >>> import requests
 
-            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-            >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> image = Image.open(requests.get(url, stream=True).raw)
 
-            >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
+        >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
 
-            >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-            >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
+        >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
 
-            >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
-            >>> encoder_outputs = model.encode(pixel_values)
-
-        """
+        >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
+        >>> encoder_outputs = model.encode(pixel_values)
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -471,31 +471,31 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import FlaxVisionEncoderDecoderModel
-            >>> import jax.numpy as jnp
-            >>> from PIL import Image
-            >>> import requests
+        ```python
+        >>> from transformers import FlaxVisionEncoderDecoderModel, ViTFeatureExtractor
+        >>> import jax.numpy as jnp
+        >>> from PIL import Image
+        >>> import requests
 
-            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-            >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> image = Image.open(requests.get(url, stream=True).raw)
 
-            >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
+        >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
 
-            >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-            >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
+        >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
 
-            >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
-            >>> encoder_outputs = model.encode(pixel_values)
+        >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
+        >>> encoder_outputs = model.encode(pixel_values)
 
-            >>> decoder_start_token_id = model.config.decoder.bos_token_id
-            >>> decoder_input_ids = jnp.ones((pixel_values.shape[0], 1), dtype="i4") * decoder_start_token_id
+        >>> decoder_start_token_id = model.config.decoder.bos_token_id
+        >>> decoder_input_ids = jnp.ones((pixel_values.shape[0], 1), dtype="i4") * decoder_start_token_id
 
-            >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
-            >>> logits = outputs.logits
-
-        """
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -599,34 +599,35 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import FlaxVisionEncoderDecoderModel, ViTFeatureExtractor, GPT2Tokenizer
-            >>> from PIL import Image
-            >>> import requests
+        ```python
+        >>> from transformers import FlaxVisionEncoderDecoderModel, ViTFeatureExtractor, GPT2Tokenizer
+        >>> from PIL import Image
+        >>> import requests
 
-            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-            >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> image = Image.open(requests.get(url, stream=True).raw)
 
-            >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
+        >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
 
-            >>> # load output tokenizer
-            >>> tokenizer_output = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> # load output tokenizer
+        >>> tokenizer_output = GPT2Tokenizer.from_pretrained('gpt2')
 
-            >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
-            >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
+        >>> # initialize a vit-gpt2 from pretrained ViT and GPT2 models. Note that the cross-attention layers will be randomly initialized
+        >>> model = FlaxVisionEncoderDecoderModel.from_encoder_decoder_pretrained('vit', 'gpt2')
 
-            >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
+        >>> pixel_values = feature_extractor(images=image, return_tensors="np").pixel_values
 
-            >>> # use GPT2's eos_token as the pad as well as eos token
-            >>> model.config.eos_token_id = model.config.decoder.eos_token_id
-            >>> model.config.pad_token_id = model.config.eos_token_id
+        >>> # use GPT2's eos_token as the pad as well as eos token
+        >>> model.config.eos_token_id = model.config.decoder.eos_token_id
+        >>> model.config.pad_token_id = model.config.eos_token_id
 
-            >>> # generation
-            >>> sequences = model.generate(pixel_values, num_beams=4, max_length=12).sequences
+        >>> # generation
+        >>> sequences = model.generate(pixel_values, num_beams=4, max_length=12).sequences
 
-            >>> captions = tokenizer_output.batch_decode(sequences, skip_special_tokens=True)
-        """
+        >>> captions = tokenizer_output.batch_decode(sequences, skip_special_tokens=True)
+        ```"""
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
index 14cf72bbe7..97706bb3f0 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
@@ -410,36 +410,36 @@ class VisionEncoderDecoderModel(PreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import TrOCRProcessor, VisionEncoderDecoderModel
-            >>> import requests
-            >>> from PIL import Image
-            >>> import torch
+        ```python
+        >>> from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+        >>> import requests
+        >>> from PIL import Image
+        >>> import torch
 
-            >>> processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
-            >>> model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')
+        >>> processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
+        >>> model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')
 
-            >>> # load image from the IAM dataset
-            >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
-            >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+        >>> # load image from the IAM dataset
+        >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
 
-            >>> # training
-            >>> model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
-            >>> model.config.pad_token_id = processor.tokenizer.pad_token_id
-            >>> model.config.vocab_size = model.config.decoder.vocab_size
+        >>> # training
+        >>> model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
+        >>> model.config.pad_token_id = processor.tokenizer.pad_token_id
+        >>> model.config.vocab_size = model.config.decoder.vocab_size
 
-            >>> pixel_values = processor(image, return_tensors="pt").pixel_values
-            >>> text = "hello world"
-            >>> labels = processor.tokenizer(text, return_tensors="pt").input_ids
-            >>> outputs = model(pixel_values=pixel_values, labels=labels)
-            >>> loss = outputs.loss
+        >>> pixel_values = processor(image, return_tensors="pt").pixel_values
+        >>> text = "hello world"
+        >>> labels = processor.tokenizer(text, return_tensors="pt").input_ids
+        >>> outputs = model(pixel_values=pixel_values, labels=labels)
+        >>> loss = outputs.loss
 
-            >>> # inference (generation)
-            >>> generated_ids = model.generate(pixel_values)
-            >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-
-        """
+        >>> # inference (generation)
+        >>> generated_ids = model.generate(pixel_values)
+        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        ```"""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
diff --git a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py
index b2223e41f5..7de8e2e494 100644
--- a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py
@@ -27,50 +27,51 @@ logger = logging.get_logger(__name__)
 
 class VisionTextDualEncoderConfig(PretrainedConfig):
     r"""
-    :class:`~transformers.VisionTextDualEncoderConfig` is the configuration class to store the configuration of a
-    :class:`~transformers.VisionTextDualEncoderModel`. It is used to instantiate
-    :class:`~transformers.VisionTextDualEncoderModel` model according to the specified arguments, defining the text
+    [`VisionTextDualEncoderConfig`] is the configuration class to store the configuration of a
+    [`VisionTextDualEncoderModel`]. It is used to instantiate
+    [`VisionTextDualEncoderModel`] model according to the specified arguments, defining the text
     model and vision model configs.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        text_config_dict (:obj:`dict`):
+        text_config_dict (`dict`):
             Dictionary of configuration options that defines text model config.
-        vision_config_dict (:obj:`dict`):
+        vision_config_dict (`dict`):
             Dictionary of configuration options that defines vison model config.
-        projection_dim (:obj:`int`, `optional`, defaults to 512):
+        projection_dim (`int`, *optional*, defaults to 512):
             Dimentionality of text and vision projection layers.
-        logit_scale_init_value (:obj:`float`, `optional`, defaults to 2.6592):
-            The inital value of the `logit_scale` paramter. Default is used as per the original CLIP implementation.
-        kwargs (`optional`):
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
+        kwargs (*optional*):
             Dictionary of keyword arguments.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import ViTConfig, BertConfig, VisionTextDualEncoderConfig, VisionTextDualEncoderModel
+    ```python
+    >>> from transformers import ViTConfig, BertConfig, VisionTextDualEncoderConfig, VisionTextDualEncoderModel
 
-        >>> # Initializing a BERT and ViT configuration
-        >>> config_vision = ViTConfig()
-        >>> config_text = BertConfig()
+    >>> # Initializing a BERT and ViT configuration
+    >>> config_vision = ViTConfig()
+    >>> config_text = BertConfig()
 
-        >>> config = VisionTextDualEncoderConfig.from_vision_text_configs(config_vision, config_text, projection_dim=512)
+    >>> config = VisionTextDualEncoderConfig.from_vision_text_configs(config_vision, config_text, projection_dim=512)
 
-        >>> # Initializing a BERT and ViT model
-        >>> model = VisionTextDualEncoderModel(config=config)
+    >>> # Initializing a BERT and ViT model
+    >>> model = VisionTextDualEncoderModel(config=config)
 
-        >>> # Accessing the model configuration
-        >>> config_vision  = model.config.vision_config
-        >>> config_text = model.config.text_config
+    >>> # Accessing the model configuration
+    >>> config_vision  = model.config.vision_config
+    >>> config_text = model.config.text_config
 
-        >>> # Saving the model, including its configuration
-        >>> model.save_pretrained('my-model')
+    >>> # Saving the model, including its configuration
+    >>> model.save_pretrained('my-model')
 
-        >>> # loading model and config from pretrained folder
-        >>> vision_text_config = VisionTextDualEncoderConfig.from_pretrained('vit-bert')
-        >>> model = VisionTextDualEncoderModel.from_pretrained('vit-bert', config=vision_text_config)
-    """
+    >>> # loading model and config from pretrained folder
+    >>> vision_text_config = VisionTextDualEncoderConfig.from_pretrained('vit-bert')
+    >>> model = VisionTextDualEncoderModel.from_pretrained('vit-bert', config=vision_text_config)
+    ```"""
 
     model_type = "vision-text-dual-encoder"
     is_composition = True
@@ -105,11 +106,11 @@ class VisionTextDualEncoderConfig(PretrainedConfig):
     @classmethod
     def from_vision_text_configs(cls, vision_config: PretrainedConfig, text_config: PretrainedConfig, **kwargs):
         r"""
-        Instantiate a :class:`VisionTextDualEncoderConfig` (or a derived class) from text model configuration and
+        Instantiate a [`VisionTextDualEncoderConfig`] (or a derived class) from text model configuration and
         vision model configuration.
 
         Returns:
-            :class:`VisionTextDualEncoderConfig`: An instance of a configuration object
+            [`VisionTextDualEncoderConfig`]: An instance of a configuration object
         """
 
         return cls(vision_config=vision_config.to_dict(), text_config=text_config.to_dict(), **kwargs)
@@ -117,10 +118,10 @@ class VisionTextDualEncoderConfig(PretrainedConfig):
     def to_dict(self):
         """
         Serializes this instance to a Python dictionary. Override the default
-        :meth:`~transformers.PretrainedConfig.to_dict`.
+        [`~PretrainedConfig.to_dict`].
 
         Returns:
-            :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
         """
         output = copy.deepcopy(self.__dict__)
         output["vision_config"] = self.vision_config.to_dict()
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
index d88a009304..610741da39 100644
--- a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
@@ -528,34 +528,35 @@ class FlaxVisionTextDualEncoderModel(FlaxPreTrainedModel):
 VISION_TEXT_DUAL_ENCODER_MODEL_DOCSTRING = r"""
     Returns:
 
-    Examples::
+    Examples:
 
-        >>> from PIL import Image
-        >>> import requests
-        >>> import jax
-        >>> from transformers import FlaxVisionTextDualEncoderModel, VisionTextDualEncoderProcessor, ViTFeatureExtractor, BertTokenizer
+    ```python
+    >>> from PIL import Image
+    >>> import requests
+    >>> import jax
+    >>> from transformers import FlaxVisionTextDualEncoderModel, VisionTextDualEncoderProcessor, ViTFeatureExtractor, BertTokenizer
 
-        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-        >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
-        >>> processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
-        >>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained("google/vit-base-patch16-224", "bert-base-uncased")
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+    >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
+    >>> processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
+    >>> model = FlaxVisionTextDualEncoderModel.from_vision_text_pretrained("google/vit-base-patch16-224", "bert-base-uncased")
 
-        >>> # contrastive training
-        >>> urls = ["http://images.cocodataset.org/val2017/000000039769.jpg", "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg]
-        >>> images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
-        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="np", padding=True)
-        >>> outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, pixel_values=inputs.pixel_values, return_loss=True)
-        >>> loss, logits_per_image = outputs.loss, outputs.logits_per_imag # this is the image-text similarity score
+    >>> # contrastive training
+    >>> urls = ["http://images.cocodataset.org/val2017/000000039769.jpg", "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg"]
+    >>> images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
+    >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="np", padding=True)
+    >>> outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, pixel_values=inputs.pixel_values, return_loss=True)
+    >>> loss, logits_per_image = outputs.loss, outputs.logits_per_image # this is the image-text similarity score
 
-        >>> # save and load from pretrained
-        >>> model.save_pretrained("vit-bert")
-        >>> model = FlaxVisionTextDualEncoderModel.from_pretrained("vit-bert")
-
-        >>> # inference
-        >>> outputs = model(**inputs)
-        >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
-        >>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
+    >>> # save and load from pretrained
+    >>> model.save_pretrained("vit-bert")
+    >>> model = FlaxVisionTextDualEncoderModel.from_pretrained("vit-bert")
 
+    >>> # inference
+    >>> outputs = model(**inputs)
+    >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+    >>> probs = jax.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities
+    ```
 """
 
 overwrite_call_docstring(
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
index b7ef229745..008af3e471 100755
--- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
@@ -310,34 +310,34 @@ class VisionTextDualEncoderModel(PreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from PIL import Image
-            >>> import requests
-            >>> from transformers import VisionTextDualEncoderModel, VisionTextDualEncoderProcessor, ViTFeatureExtractor, BertTokenizer
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import VisionTextDualEncoderModel, VisionTextDualEncoderProcessor, ViTFeatureExtractor, BertTokenizer
 
-            >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
-            >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
-            >>> processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
-            >>> model = VisionTextDualEncoderModel.from_vision_text_pretrained("google/vit-base-patch16-224", "bert-base-uncased")
+        >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+        >>> feature_extractor = ViTFeatureExtractor.from_pretrained("google/vit-base-patch16-224")
+        >>> processor = VisionTextDualEncoderProcessor(feature_extractor, tokenizer)
+        >>> model = VisionTextDualEncoderModel.from_vision_text_pretrained("google/vit-base-patch16-224", "bert-base-uncased")
 
-            >>> # contrastive training
-            >>> urls = ["http://images.cocodataset.org/val2017/000000039769.jpg", "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg]
-            >>> images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
-            >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="pt", padding=True)
-            >>> outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, pixel_values=inputs.pixel_values, return_loss=True)
-            >>> loss, logits_per_image = outputs.loss, outputs.logits_per_imag # this is the image-text similarity score
+        >>> # contrastive training
+        >>> urls = ["http://images.cocodataset.org/val2017/000000039769.jpg", "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg"]
+        >>> images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
+        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="pt", padding=True)
+        >>> outputs = model(input_ids=inputs.input_ids, attention_mask=inputs.attention_mask, pixel_values=inputs.pixel_values, return_loss=True)
+        >>> loss, logits_per_image = outputs.loss, outputs.logits_per_image # this is the image-text similarity score
 
-            >>> # save and load from pretrained
-            >>> model.save_pretrained("vit-bert")
-            >>> model = VisionTextDualEncoderModel.from_pretrained("vit-bert")
+        >>> # save and load from pretrained
+        >>> model.save_pretrained("vit-bert")
+        >>> model = VisionTextDualEncoderModel.from_pretrained("vit-bert")
 
-            >>> # inference
-            >>> outputs = model(**inputs)
-            >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
-            >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
-
-        """
+        >>> # inference
+        >>> outputs = model(**inputs)
+        >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+        >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+        ```"""
         return_dict = return_dict if return_dict is not None else self.config.return_dict
 
         vision_outputs = self.vision_model(
diff --git a/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py
index fb32320780..5e607eb108 100644
--- a/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py
@@ -30,15 +30,15 @@ class VisionTextDualEncoderProcessor:
     Constructs a VisionTextDualEncoder processor which wraps a vision feature extractor and a tokenizer into a single
     processor.
 
-    :class:`~transformers.VisionTextDualEncoderProcessor` offers all the functionalities of
-    :class:`~transformers.AutoFeatureExtractor` and :class:`~transformers.AutoTokenizer`. See the
-    :meth:`~transformers.VisionTextDualEncoderProcessor.__call__` and
-    :meth:`~transformers.VisionTextDualEncoderProcessor.decode` for more information.
+    [`VisionTextDualEncoderProcessor`] offers all the functionalities of
+    [`AutoFeatureExtractor`] and [`AutoTokenizer`]. See the
+    [`~VisionTextDualEncoderProcessor.__call__`] and
+    [`~VisionTextDualEncoderProcessor.decode`] for more information.
 
     Args:
-        feature_extractor (:class:`~transformers.AutoFeatureExtractor`):
+        feature_extractor ([`AutoFeatureExtractor`]):
             The feature extractor is a required input.
-        tokenizer (:class:`~transformers.PreTrainedTokenizer`):
+        tokenizer ([`PreTrainedTokenizer`]):
             The tokenizer is a required input.
     """
 
@@ -61,17 +61,19 @@ class VisionTextDualEncoderProcessor:
     def save_pretrained(self, save_directory):
         """
         Save a VisionTextDualEncoder feature extractor object and VisionTextDualEncoder tokenizer object to the
-        directory ``save_directory``, so that it can be re-loaded using the
-        :func:`~transformers.VisionTextDualEncoderProcessor.from_pretrained` class method.
+        directory `save_directory`, so that it can be re-loaded using the
+        [`~VisionTextDualEncoderProcessor.from_pretrained`] class method.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` and
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
-            docstrings of the methods above for more information.
+        This class method is simply calling [`~PreTrainedFeatureExtractor.save_pretrained`] and
+        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
+        docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            save_directory (:obj:`str` or :obj:`os.PathLike`):
+            save_directory (`str` or `os.PathLike`):
                 Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                 be created if it does not exist).
         """
@@ -82,32 +84,34 @@ class VisionTextDualEncoderProcessor:
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         r"""
-        Instantiate a :class:`~transformers.VisionTextDualEncoderProcessor` from a pretrained VisionTextDualEncoder
+        Instantiate a [`VisionTextDualEncoderProcessor`] from a pretrained VisionTextDualEncoder
         processor.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling AutoFeatureExtractor's
-            :meth:`~transformers.PreTrainedFeatureExtractor.from_pretrained` and AutoTokenizer's
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`. Please refer to the
-            docstrings of the methods above for more information.
+        This class method is simply calling AutoFeatureExtractor's
+        [`~PreTrainedFeatureExtractor.from_pretrained`] and AutoTokenizer's
+        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. Please refer to the
+        docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 This can be either:
 
-                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a feature extractor file saved using the
-                  :meth:`~transformers.PreTrainedFeatureExtractor.save_pretrained` method, e.g.,
-                  ``./my_model_directory/``.
-                - a path or url to a saved feature extractor JSON `file`, e.g.,
-                  ``./my_model_directory/preprocessor_config.json``.
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~PreTrainedFeatureExtractor.save_pretrained`] method, e.g.,
+                  `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
 
             **kwargs
-                Additional keyword arguments passed along to both :class:`~transformers.PreTrainedFeatureExtractor` and
-                :class:`~transformers.PreTrainedTokenizer`
+                Additional keyword arguments passed along to both [`PreTrainedFeatureExtractor`] and
+                [`PreTrainedTokenizer`]
         """
         feature_extractor = AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
         tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
@@ -117,38 +121,38 @@ class VisionTextDualEncoderProcessor:
     def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
         """
         Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the
-        :obj:`text` and :obj:`kwargs` arguments to VisionTextDualEncoderTokenizer's
-        :meth:`~transformers.PreTrainedTokenizer.__call__` if :obj:`text` is not :obj:`None` to encode the text. To
-        prepare the image(s), this method forwards the :obj:`images` and :obj:`kwrags` arguments to
-        AutoFeatureExtractor's :meth:`~transformers.AutoFeatureExtractor.__call__` if :obj:`images` is not :obj:`None`.
+        `text` and `kwargs` arguments to VisionTextDualEncoderTokenizer's
+        [`~PreTrainedTokenizer.__call__`] if `text` is not `None` to encode the text. To
+        prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        AutoFeatureExtractor's [`~AutoFeatureExtractor.__call__`] if `images` is not `None`.
         Please refer to the doctsring of the above two methods for more information.
 
         Args:
-            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text (`str`, `List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                 tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                 number of channels, H and W are image height and width.
 
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
-                * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
 
         Returns:
-            :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
 
-            - **input_ids** -- List of token ids to be fed to a model. Returned when :obj:`text` is not :obj:`None`.
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
-              :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names` and if
-              :obj:`text` is not :obj:`None`).
-            - **pixel_values** -- Pixel values to be fed to a model. Returned when :obj:`images` is not :obj:`None`.
+              `return_attention_mask=True` or if `"attention_mask"` is in `self.model_input_names` and if
+              `text` is not `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
         """
 
         if text is None and images is None:
@@ -171,7 +175,7 @@ class VisionTextDualEncoderProcessor:
     def batch_decode(self, *args, **kwargs):
         """
         This method forwards all its arguments to VisionTextDualEncoderTokenizer's
-        :meth:`~transformers.PreTrainedTokenizer.batch_decode`. Please refer to the docstring of this method for more
+        [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more
         information.
         """
         return self.tokenizer.batch_decode(*args, **kwargs)
@@ -179,7 +183,7 @@ class VisionTextDualEncoderProcessor:
     def decode(self, *args, **kwargs):
         """
         This method forwards all its arguments to VisionTextDualEncoderTokenizer's
-        :meth:`~transformers.PreTrainedTokenizer.decode`. Please refer to the docstring of this method for more
+        [`~PreTrainedTokenizer.decode`]. Please refer to the docstring of this method for more
         information.
         """
         return self.tokenizer.decode(*args, **kwargs)
diff --git a/src/transformers/models/visual_bert/configuration_visual_bert.py b/src/transformers/models/visual_bert/configuration_visual_bert.py
index be98508fdc..5963af26aa 100644
--- a/src/transformers/models/visual_bert/configuration_visual_bert.py
+++ b/src/transformers/models/visual_bert/configuration_visual_bert.py
@@ -36,71 +36,72 @@ VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class VisualBertConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.VisualBertModel`. It is used
+    This is the configuration class to store the configuration of a [`VisualBertModel`]. It is used
     to instantiate an VisualBERT model according to the specified arguments, defining the model architecture.
     Instantiating a configuration with the defaults will yield a similar configuration to that of the VisualBERT
-    `visualbert-vqa-coco-pre <https://huggingface.co/uclanlp/visualbert-vqa-coco-pre>`__ architecture.
+    [visualbert-vqa-coco-pre](https://huggingface.co/uclanlp/visualbert-vqa-coco-pre) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the VisualBERT model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.VisualBertModel`. Vocabulary size of the
-            model. Defines the different tokens that can be represented by the ``inputs_ids`` passed to the forward
-            method of :class:`~transformers.VisualBertModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            the `input_ids` passed when calling [`VisualBertModel`]. Vocabulary size of the
+            model. Defines the different tokens that can be represented by the `input_ids` passed to the forward
+            method of [`VisualBertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        visual_embedding_dim (:obj:`int`, `optional`, defaults to 512):
+        visual_embedding_dim (`int`, *optional*, defaults to 512):
             Dimensionality of the visual embeddings to be passed to the model.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling
-            :class:`~transformers.VisualBertModel`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling
+            [`VisualBertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        bypass_transformer (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not the model should bypass the transformer for the visual embeddings. If set to :obj:`True`,
-            the model directly concatenates the visual embeddings from :class:`~transformers.VisualBertEmbeddings` with
+        bypass_transformer (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should bypass the transformer for the visual embeddings. If set to `True`,
+            the model directly concatenates the visual embeddings from [`VisualBertEmbeddings`] with
             text output from transformers, and then pass it to a self-attention layer.
-        special_visual_initialize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        special_visual_initialize (`bool`, *optional*, defaults to `True`):
             Whether or not the visual token type and position type embedding weights should be initialized the same as
-            the textual token type and positive type embeddings. When set to :obj:`True`, the weights of the textual
+            the textual token type and position type embeddings. When set to `True`, the weights of the textual
             token type and position type embeddings are copied to the respective visual embedding layers.
 
 
-    Example::
+    Example:
 
-        >>> from transformers import VisualBertModel, VisualBertConfig
+    ```python
+    >>> from transformers import VisualBertModel, VisualBertConfig
 
-        >>> # Initializing a VisualBERT visualbert-vqa-coco-pre style configuration
-        >>> configuration = VisualBertConfig.from_pretrained('visualbert-vqa-coco-pre')
+    >>> # Initializing a VisualBERT visualbert-vqa-coco-pre style configuration
+    >>> configuration = VisualBertConfig.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
 
-        >>> # Initializing a model from the visualbert-vqa-coco-pre style configuration
-        >>> model = VisualBertModel(configuration)
+    >>> # Initializing a model from the visualbert-vqa-coco-pre style configuration
+    >>> model = VisualBertModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "visual_bert"
 
diff --git a/src/transformers/models/visual_bert/modeling_visual_bert.py b/src/transformers/models/visual_bert/modeling_visual_bert.py
index bd822dabbe..1100bc94bc 100755
--- a/src/transformers/models/visual_bert/modeling_visual_bert.py
+++ b/src/transformers/models/visual_bert/modeling_visual_bert.py
@@ -736,30 +736,31 @@ class VisualBertModel(VisualBertPreTrainedModel):
 
         Returns:
 
-        Example::
+        Example:
 
-            # Assumption: `get_visual_embeddings(image)` gets the visual embeddings of the image.
-            from transformers import BertTokenizer, VisualBertModel
-            import torch
+        ```python
+        # Assumption: `get_visual_embeddings(image)` gets the visual embeddings of the image.
+        from transformers import BertTokenizer, VisualBertModel
+        import torch
 
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = VisualBertModel.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = VisualBertModel.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
 
-            inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
-            visual_embeds = get_visual_embeddings(image).unsqueeze(0)
-            visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
-            visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+        inputs = tokenizer("The capital of France is Paris.", return_tensors="pt")
+        visual_embeds = get_visual_embeddings(image).unsqueeze(0)
+        visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
 
-            inputs.update({
-                "visual_embeds": visual_embeds,
-                "visual_token_type_ids": visual_token_type_ids,
-                "visual_attention_mask": visual_attention_mask
-            })
+        inputs.update({
+            "visual_embeds": visual_embeds,
+            "visual_token_type_ids": visual_token_type_ids,
+            "visual_attention_mask": visual_attention_mask
+        })
 
-            outputs = model(**inputs)
+        outputs = model(**inputs)
 
-            last_hidden_states = outputs.last_hidden_state
-        """
+        last_hidden_states = outputs.last_hidden_state
+        ```"""
 
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
diff --git a/src/transformers/models/vit/configuration_vit.py b/src/transformers/models/vit/configuration_vit.py
index 6d243d7779..9c1ee38b2e 100644
--- a/src/transformers/models/vit/configuration_vit.py
+++ b/src/transformers/models/vit/configuration_vit.py
@@ -28,58 +28,58 @@ VIT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class ViTConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.ViTModel`. It is used to
+    This is the configuration class to store the configuration of a [`ViTModel`]. It is used to
     instantiate an ViT model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the ViT `google/vit-base-patch16-224
-    <https://huggingface.co/google/vit-base-patch16-224>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the ViT [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        image_size (:obj:`int`, `optional`, defaults to :obj:`224`):
+        image_size (`int`, *optional*, defaults to `224`):
             The size (resolution) of each image.
-        patch_size (:obj:`int`, `optional`, defaults to :obj:`16`):
+        patch_size (`int`, *optional*, defaults to `16`):
             The size (resolution) of each patch.
-        num_channels (:obj:`int`, `optional`, defaults to :obj:`3`):
+        num_channels (`int`, *optional*, defaults to `3`):
             The number of input channels.
-        qkv_bias (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        qkv_bias (`bool`, *optional*, defaults to `True`):
             Whether to add a bias to the queries, keys and values.
 
 
-    Example::
+    Example:
 
-        >>> from transformers import ViTModel, ViTConfig
+    ```python
+    >>> from transformers import ViTModel, ViTConfig
 
-        >>> # Initializing a ViT vit-base-patch16-224 style configuration
-        >>> configuration = ViTConfig()
+    >>> # Initializing a ViT vit-base-patch16-224 style configuration
+    >>> configuration = ViTConfig()
 
-        >>> # Initializing a model from the vit-base-patch16-224 style configuration
-        >>> model = ViTModel(configuration)
+    >>> # Initializing a model from the vit-base-patch16-224 style configuration
+    >>> model = ViTModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "vit"
 
     def __init__(
diff --git a/src/transformers/models/vit/feature_extraction_vit.py b/src/transformers/models/vit/feature_extraction_vit.py
index b45c7088f9..d67f60ef23 100644
--- a/src/transformers/models/vit/feature_extraction_vit.py
+++ b/src/transformers/models/vit/feature_extraction_vit.py
@@ -38,25 +38,25 @@ class ViTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
     r"""
     Constructs a ViT feature extractor.
 
-    This feature extractor inherits from :class:`~transformers.FeatureExtractionMixin` which contains most of the main
+    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        do_resize (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to resize the input to a certain :obj:`size`.
-        size (:obj:`int` or :obj:`Tuple(int)`, `optional`, defaults to 224):
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the input to a certain `size`.
+        size (`int` or `Tuple(int)`, *optional*, defaults to 224):
             Resize the input to the given size. If a tuple is provided, it should be (width, height). If only an
-            integer is provided, then the input will be resized to (size, size). Only has an effect if :obj:`do_resize`
-            is set to :obj:`True`.
-        resample (:obj:`int`, `optional`, defaults to :obj:`PIL.Image.BILINEAR`):
-            An optional resampling filter. This can be one of :obj:`PIL.Image.NEAREST`, :obj:`PIL.Image.BOX`,
-            :obj:`PIL.Image.BILINEAR`, :obj:`PIL.Image.HAMMING`, :obj:`PIL.Image.BICUBIC` or :obj:`PIL.Image.LANCZOS`.
-            Only has an effect if :obj:`do_resize` is set to :obj:`True`.
-        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            integer is provided, then the input will be resized to (size, size). Only has an effect if `do_resize`
+            is set to `True`.
+        resample (`int`, *optional*, defaults to `PIL.Image.BILINEAR`):
+            An optional resampling filter. This can be one of `PIL.Image.NEAREST`, `PIL.Image.BOX`,
+            `PIL.Image.BILINEAR`, `PIL.Image.HAMMING`, `PIL.Image.BICUBIC` or `PIL.Image.LANCZOS`.
+            Only has an effect if `do_resize` is set to `True`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
             Whether or not to normalize the input with mean and standard deviation.
-        image_mean (:obj:`List[int]`, defaults to :obj:`[0.5, 0.5, 0.5]`):
+        image_mean (`List[float]`, defaults to `[0.5, 0.5, 0.5]`):
             The sequence of means for each channel, to be used when normalizing images.
-        image_std (:obj:`List[int]`, defaults to :obj:`[0.5, 0.5, 0.5]`):
+        image_std (`List[float]`, defaults to `[0.5, 0.5, 0.5]`):
             The sequence of standard deviations for each channel, to be used when normalizing images.
     """
 
@@ -86,27 +86,29 @@ class ViTFeatureExtractor(FeatureExtractionMixin, ImageFeatureExtractionMixin):
         """
         Main method to prepare for the model one or several image(s).
 
-        .. warning::
+        <Tip warning={true}>
 
-           NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
-           PIL images.
+        NumPy arrays and PyTorch tensors are converted to PIL images when resizing, so the most efficient is to pass
+        PIL images.
+
+        </Tip>
 
         Args:
-            images (:obj:`PIL.Image.Image`, :obj:`np.ndarray`, :obj:`torch.Tensor`, :obj:`List[PIL.Image.Image]`, :obj:`List[np.ndarray]`, :obj:`List[torch.Tensor]`):
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                 tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                 number of channels, H and W are image height and width.
 
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`, defaults to :obj:`'np'`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*, defaults to `'np'`):
                 If set, will return tensors of a particular framework. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return NumPy :obj:`np.ndarray` objects.
-                * :obj:`'jax'`: Return JAX :obj:`jnp.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
 
         Returns:
-            :class:`~transformers.BatchFeature`: A :class:`~transformers.BatchFeature` with the following fields:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
 
             - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
               width).
diff --git a/src/transformers/models/vit/modeling_flax_vit.py b/src/transformers/models/vit/modeling_flax_vit.py
index 824629cad3..e478063dbb 100644
--- a/src/transformers/models/vit/modeling_flax_vit.py
+++ b/src/transformers/models/vit/modeling_flax_vit.py
@@ -515,21 +515,23 @@ class FlaxViTModel(FlaxViTPreTrainedModel):
 FLAX_VISION_MODEL_DOCSTRING = """
     Returns:
 
-    Examples::
+    Examples:
 
-        >>> from transformers import ViTFeatureExtractor, FlaxViTModel
-        >>> from PIL import Image
-        >>> import requests
+    ```python
+    >>> from transformers import ViTFeatureExtractor, FlaxViTModel
+    >>> from PIL import Image
+    >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+    >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+    >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
-        >>> model = FlaxViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
+    >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
+    >>> model = FlaxViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
 
-        >>> inputs = feature_extractor(images=image, return_tensors="np")
-        >>> outputs = model(**inputs)
-        >>> last_hidden_states = outputs.last_hidden_state
+    >>> inputs = feature_extractor(images=image, return_tensors="np")
+    >>> outputs = model(**inputs)
+    >>> last_hidden_states = outputs.last_hidden_state
+    ```
 """
 
 overwrite_call_docstring(FlaxViTModel, FLAX_VISION_MODEL_DOCSTRING)
@@ -594,26 +596,28 @@ class FlaxViTForImageClassification(FlaxViTPreTrainedModel):
 FLAX_VISION_CLASSIF_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> from transformers import ViTFeatureExtractor, FlaxViTForImageClassification
-        >>> from PIL import Image
-        >>> import jax
-        >>> import requests
+    ```python
+    >>> from transformers import ViTFeatureExtractor, FlaxViTForImageClassification
+    >>> from PIL import Image
+    >>> import jax
+    >>> import requests
 
-        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-        >>> image = Image.open(requests.get(url, stream=True).raw)
+    >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+    >>> image = Image.open(requests.get(url, stream=True).raw)
 
-        >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
-        >>> model = FlaxViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
+    >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
+    >>> model = FlaxViTForImageClassification.from_pretrained('google/vit-base-patch16-224')
 
-        >>> inputs = feature_extractor(images=image, return_tensors="np")
-        >>> outputs = model(**inputs)
-        >>> logits = outputs.logits
+    >>> inputs = feature_extractor(images=image, return_tensors="np")
+    >>> outputs = model(**inputs)
+    >>> logits = outputs.logits
 
-        >>> # model predicts one of the 1000 ImageNet classes
-        >>> predicted_class_idx = jax.numpy.argmax(logits, axis=-1)
-        >>> print("Predicted class:", model.config.id2label[predicted_class_idx.item()])
+    >>> # model predicts one of the 1000 ImageNet classes
+    >>> predicted_class_idx = jax.numpy.argmax(logits, axis=-1)
+    >>> print("Predicted class:", model.config.id2label[predicted_class_idx.item()])
+    ```
 """
 
 overwrite_call_docstring(FlaxViTForImageClassification, FLAX_VISION_CLASSIF_DOCSTRING)
diff --git a/src/transformers/models/vit/modeling_tf_vit.py b/src/transformers/models/vit/modeling_tf_vit.py
index 20a9b520ee..f04169562e 100644
--- a/src/transformers/models/vit/modeling_tf_vit.py
+++ b/src/transformers/models/vit/modeling_tf_vit.py
@@ -674,22 +674,23 @@ class TFViTModel(TFViTPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import ViTFeatureExtractor, TFViTModel
-            >>> from PIL import Image
-            >>> import requests
+        ```python
+        >>> from transformers import ViTFeatureExtractor, TFViTModel
+        >>> from PIL import Image
+        >>> import requests
 
-            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-            >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> image = Image.open(requests.get(url, stream=True).raw)
 
-            >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
-            >>> model = TFViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
+        >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
+        >>> model = TFViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
 
-            >>> inputs = feature_extractor(images=image, return_tensors="tf")
-            >>> outputs = model(**inputs)
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> inputs = feature_extractor(images=image, return_tensors="tf")
+        >>> outputs = model(**inputs)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         inputs = input_processing(
             func=self.call,
             config=self.config,
diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py
index 50b69c9eb8..cbafc36af0 100644
--- a/src/transformers/models/vit/modeling_vit.py
+++ b/src/transformers/models/vit/modeling_vit.py
@@ -517,22 +517,23 @@ class ViTModel(ViTPreTrainedModel):
         r"""
         Returns:
 
-        Examples::
+        Examples:
 
-            >>> from transformers import ViTFeatureExtractor, ViTModel
-            >>> from PIL import Image
-            >>> import requests
+        ```python
+        >>> from transformers import ViTFeatureExtractor, ViTModel
+        >>> from PIL import Image
+        >>> import requests
 
-            >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-            >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+        >>> image = Image.open(requests.get(url, stream=True).raw)
 
-            >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
-            >>> model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
+        >>> feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
+        >>> model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
 
-            >>> inputs = feature_extractor(images=image, return_tensors="pt")
-            >>> outputs = model(**inputs)
-            >>> last_hidden_states = outputs.last_hidden_state
-        """
+        >>> inputs = feature_extractor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
diff --git a/src/transformers/models/wav2vec2/configuration_wav2vec2.py b/src/transformers/models/wav2vec2/configuration_wav2vec2.py
index 6f1d8c3234..a9d47cc632 100644
--- a/src/transformers/models/wav2vec2/configuration_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/configuration_wav2vec2.py
@@ -28,169 +28,165 @@ WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class Wav2Vec2Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.Wav2Vec2Model`. It is used to
+    This is the configuration class to store the configuration of a [`Wav2Vec2Model`]. It is used to
     instantiate an Wav2Vec2 model according to the specified arguments, defining the model architecture. Instantiating
     a configuration with the defaults will yield a similar configuration to that of the Wav2Vec2
-    `facebook/wav2vec2-base-960h <https://huggingface.co/facebook/wav2vec2-base-960h>`__ architecture.
+    [facebook/wav2vec2-base-960h](https://huggingface.co/facebook/wav2vec2-base-960h) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 32):
+        vocab_size (`int`, *optional*, defaults to 32):
             Vocabulary size of the Wav2Vec2 model. Defines the number of different tokens that can be represented by
-            the :obj:`inputs_ids` passed when calling :class:`~transformers.Wav2Vec2Model` or
-            :class:`~transformers.TFWav2Vec2Model`. Vocabulary size of the model. Defines the different tokens that can
-            be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.Wav2Vec2Model`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            the `inputs_ids` passed when calling [`Wav2Vec2Model`] or
+            [`TFWav2Vec2Model`]. Vocabulary size of the model. Defines the different tokens that can
+            be represented by the *inputs_ids* passed to the forward method of [`Wav2Vec2Model`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        attention_dropout (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        final_dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probability for the final projection layer of :class:`Wav2Vec2ForCTC`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`Wav2Vec2ForCTC`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        feat_extract_norm (:obj:`str`, `optional`, defaults to :obj:`"group"`):
-            The norm to be applied to 1D convolutional layers in feature extractor. One of :obj:`"group"` for group
-            normalization of only the first 1D convolutional layer or :obj:`"layer"` for layer normalization of all 1D
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
             convolutional layers.
-        feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probability for output of the feature extractor.
-        feat_extract_activation (:obj:`str, `optional`, defaults to :obj:`"gelu"`):
+        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the 1D convolutional layers of the feature
-            extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        feat_quantizer_dropout (obj:`float`, `optional`, defaults to 0.0):
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for quantized feature extractor states.
-        conv_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 512, 512, 512)`):
+        conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
-            feature extractor. The length of `conv_dim` defines the number of 1D convolutional layers.
-        conv_stride (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 2, 2, 2, 2, 2, 2)`):
+            feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
-            of `conv_stride` defines the number of convolutional layers and has to match the length of `conv_dim`.
-        conv_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(10, 3, 3, 3, 3, 3, 3)`):
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
-            length of `conv_kernel` defines the number of convolutional layers and has to match the length of
-            `conv_dim`.
-        conv_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
             Whether the 1D convolutional layers have a bias.
-        num_conv_pos_embeddings (:obj:`int`, `optional`, defaults to 128):
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
             Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
             embeddings layer.
-        num_conv_pos_embedding_groups (:obj:`int`, `optional`, defaults to 16):
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
             Number of groups of 1D convolutional positional embeddings layer.
-        do_stable_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to apply `stable` layer norm architecture of the Transformer encoder. ``do_stable_layer_norm is
-            True`` corresponds to applying layer norm before the attention layer, whereas ``do_stable_layer_norm is
-            False`` corresponds to applying layer norm after the attention layer.
-        apply_spec_augment (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer.
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
             Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
-            `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
-            <https://arxiv.org/abs/1904.08779>`__.
-        mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
             Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
             procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
             reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
-            masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
-            the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
-        mask_time_length (:obj:`int`, `optional`, defaults to 10):
+            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease
+            the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
             Length of vector span along the time axis.
-        mask_time_min_masks (:obj:`int`, `optional`, defaults to 2),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the time axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_time_min_masks (`int`, *optional*, defaults to 2):
+            The minimum number of masks of length `mask_time_length` generated along the time axis, each time
+            step, irrespectively of `mask_time_prob`. Only relevant if
             ''mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks''
-        mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
             masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
             the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
-            span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
-            overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
-            is True``.
-        mask_feature_length (:obj:`int`, `optional`, defaults to 10):
+            span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that
+            overlap may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
             Length of vector span along the feature axis.
-        mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_feature_min_masks (`int`, *optional*, defaults to 0):
+            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+            step, irrespectively of `mask_feature_prob`. Only relevant if
             ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
-        num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
+        num_codevectors_per_group (`int`, *optional*, defaults to 320):
             Number of entries in each quantization codebook (group).
-        num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
+        num_codevector_groups (`int`, *optional*, defaults to 2):
             Number of codevector groups for product codevector quantization.
-        contrastive_logits_temperature (:obj:`float`, `optional`, defaults to 0.1):
-            The temperature `kappa` in the contrastive loss.
-        feat_quantizer_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
+            The temperature *kappa* in the contrastive loss.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for the output of the feature extractor that's used by the quantizer.
-        num_negatives (:obj:`int`, `optional`, defaults to 100):
+        num_negatives (`int`, *optional*, defaults to 100):
             Number of negative samples for the contrastive loss.
-        codevector_dim (:obj:`int`, `optional`, defaults to 256):
+        codevector_dim (`int`, *optional*, defaults to 256):
             Dimensionality of the quantized feature vectors.
-        proj_codevector_dim (:obj:`int`, `optional`, defaults to 256):
+        proj_codevector_dim (`int`, *optional*, defaults to 256):
             Dimensionality of the final projection of both the quantized and the transformer features.
-        diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1):
+        diversity_loss_weight (`float`, *optional*, defaults to 0.1):
             The weight of the codebook diversity loss component.
-        ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
-            Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
-            instance of :class:`~transformers.Wav2Vec2ForCTC`.
-        ctc_zero_infinity (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to zero infinite losses and the associated gradients of ``torch.nn.CTCLoss``. Infinite losses
+        ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`Wav2Vec2ForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses
             mainly occur when the inputs are too short to be aligned to the targets. Only relevant when training an
-            instance of :class:`~transformers.Wav2Vec2ForCTC`.
-        use_weighted_layer_sum (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            instance of [`Wav2Vec2ForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
             Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
-            instance of :class:`~transformers.Wav2Vec2ForSequenceClassification`.
-        classifier_proj_size (:obj:`int`, `optional`, defaults to 256):
+            instance of [`Wav2Vec2ForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 256):
             Dimensionality of the projection before token mean-pooling for classification.
-        tdnn_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 1500)`):
-            A tuple of integers defining the number of output channels of each 1D convolutional layer in the `TDNN`
-            module of the `XVector` model. The length of `tdnn_dim` defines the number of `TDNN` layers.
-        tdnn_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 3, 3, 1, 1)`):
-            A tuple of integers defining the kernel size of each 1D convolutional layer in the `TDNN` module of the
-            `XVector` model. The length of `tdnn_kernel` has to match the length of `tdnn_dim`.
-        tdnn_dilation (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(1, 2, 3, 1, 1)`):
-            A tuple of integers defining the dilation factor of each 1D convolutional layer in `TDNN` module of the
-            `XVector` model. The length of `tdnn_dilation` has to match the length of `tdnn_dim`.
-        xvector_output_dim (:obj:`int`, `optional`, defaults to 512):
-            Dimensionality of the `XVector` embedding vectors.
-        add_adapter (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+            A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
+            module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
+        tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
+            *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
+        tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+            A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
+            *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
+        xvector_output_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of the *XVector* embedding vectors.
+        add_adapter (`bool`, *optional*, defaults to `False`):
             Whether a convolutional network should be stacked on top of the Wav2Vec2 Encoder. Can be very useful for
             warm-starting Wav2Vec2 for SpeechEncoderDecoder models.
-        adapter_kernel_size (:obj:`int`, `optional`, defaults to 3):
-            Kernel size of the convolutional layers in the adapter network. Only relevant if ``add_adapter is True``.
-        adapter_stride (:obj:`int`, `optional`, defaults to 2):
-            Stride of the convolutional layers in the adapter network. Only relevant if ``add_adapter is True``.
-        num_adapter_layers (:obj:`int`, `optional`, defaults to 3):
-            Number of convolutional layers that should be used in the adapter network. Only relevant if ``add_adapter
-            is True``.
-        output_hidden_size (:obj:`int`, `optional`):
-            Dimensionality of the encoder output layer. If not defined, this defaults to `hidden-size`. Only relevant
-            if ``add_adapter is True``.
+        adapter_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        adapter_stride (`int`, *optional*, defaults to 2):
+            Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        num_adapter_layers (`int`, *optional*, defaults to 3):
+            Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is True`.
+        output_hidden_size (`int`, *optional*):
+            Dimensionality of the encoder output layer. If not defined, this defaults to `hidden_size`. Only relevant
+            if `add_adapter is True`.
 
-    Example::
+    Example:
 
-        >>> from transformers import Wav2Vec2Model, Wav2Vec2Config
+    ```python
+    >>> from transformers import Wav2Vec2Model, Wav2Vec2Config
 
-        >>> # Initializing a Wav2Vec2 facebook/wav2vec2-base-960h style configuration
-        >>> configuration = Wav2Vec2Config()
+    >>> # Initializing a Wav2Vec2 facebook/wav2vec2-base-960h style configuration
+    >>> configuration = Wav2Vec2Config()
 
-        >>> # Initializing a model from the facebook/wav2vec2-base-960h style configuration
-        >>> model = Wav2Vec2Model(configuration)
+    >>> # Initializing a model from the facebook/wav2vec2-base-960h style configuration
+    >>> model = Wav2Vec2Model(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "wav2vec2"
 
     def __init__(
diff --git a/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py b/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py
index 0f64412bbd..1b4894430d 100644
--- a/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/feature_extraction_wav2vec2.py
@@ -34,34 +34,32 @@ class Wav2Vec2FeatureExtractor(SequenceFeatureExtractor):
     Constructs a Wav2Vec2 feature extractor.
 
     This feature extractor inherits from
-    :class:`~transformers.feature_extraction_sequence_utils.SequenceFeatureExtractor` which contains most of the main
+    [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        feature_size (:obj:`int`, defaults to 1):
+        feature_size (`int`, defaults to 1):
             The feature dimension of the extracted features.
-        sampling_rate (:obj:`int`, defaults to 16000):
+        sampling_rate (`int`, defaults to 16000):
             The sampling rate at which the audio files should be digitalized expressed in Hertz per second (Hz).
-        padding_value (:obj:`float`, defaults to 0.0):
+        padding_value (`float`, defaults to 0.0):
             The value that is used to fill the padding values.
-        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        do_normalize (`bool`, *optional*, defaults to `False`):
             Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
-            improve the performance for some models, *e.g.*, `wav2vec2-lv60
-            <https://huggingface.co/models?search=lv60>`__.
-        return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not :meth:`~transformers.Wav2Vec2FeatureExtractor.__call__` should return :obj:`attention_mask`.
+            improve the performance for some models, *e.g.*, [wav2vec2-lv60](https://huggingface.co/models?search=lv60).
+        return_attention_mask (`bool`, *optional*, defaults to `False`):
+            Whether or not [`~Wav2Vec2FeatureExtractor.__call__`] should return `attention_mask`.
 
-            .. note::
+            <Tip>
 
-                Wav2Vec2 models that have set ``config.feat_extract_norm == "group"``, such as `wav2vec2-base
-                <https://huggingface.co/facebook/wav2vec2-base-960h>`__, have **not** been trained using
-                :obj:`attention_mask`. For such models, :obj:`input_values` should simply be padded with 0 and no
-                :obj:`attention_mask` should be passed.
+            Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
+            `attention_mask`. For such models, `input_values` should simply be padded with 0 and no
+            `attention_mask` should be passed.
 
-                For Wav2Vec2 models that have set ``config.feat_extract_norm == "layer"``, such as `wav2vec2-lv60
-                <https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self>`__, :obj:`attention_mask` should be
-                passed for batched inference.
-    """
+            For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be
+            passed for batched inference.
+
+            </Tip>"""
 
     model_input_names = ["input_values", "attention_mask"]
 
@@ -116,55 +114,55 @@ class Wav2Vec2FeatureExtractor(SequenceFeatureExtractor):
         Main method to featurize and prepare for the model one or several sequence(s). sequences.
 
         Args:
-            raw_speech (:obj:`np.ndarray`, :obj:`List[float]`, :obj:`List[np.ndarray]`, :obj:`List[List[float]]`):
+            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                 The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                 values, a list of numpy arrays or a list of list of float values.
-            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
                 index) among:
 
-                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                   single sequence if provided).
-                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                   maximum acceptable input length for the model if that argument is not provided.
-                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                   different lengths).
-            max_length (:obj:`int`, `optional`):
+            max_length (`int`, *optional*):
                 Maximum length of the returned list and optionally padding length (see above).
-            truncation (:obj:`bool`):
-                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
-            pad_to_multiple_of (:obj:`int`, `optional`):
+            truncation (`bool`):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value.
 
                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                 >= 7.5 (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
-            return_attention_mask (:obj:`bool`, `optional`):
+            return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
                 to the specific feature_extractor's default.
 
-                `What are attention masks? <../glossary.html#attention-mask>`__
+                [What are attention masks?](../glossary#attention-mask)
 
-                .. note::
+                <Tip>
 
-                    Wav2Vec2 models that have set ``config.feat_extract_norm == "group"``, such as `wav2vec2-base
-                    <https://huggingface.co/facebook/wav2vec2-base-960h>`__, have **not** been trained using
-                    :obj:`attention_mask`. For such models, :obj:`input_values` should simply be padded with 0 and no
-                    :obj:`attention_mask` should be passed.
+                Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
+                `attention_mask`. For such models, `input_values` should simply be padded with 0 and no
+                `attention_mask` should be passed.
 
-                    For Wav2Vec2 models that have set ``config.feat_extract_norm == "layer"``, such as `wav2vec2-lv60
-                    <https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self>`__, :obj:`attention_mask` should be
-                    passed for batched inference.
+                For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be
+                passed for batched inference.
 
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+                </Tip>
+
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
-            sampling_rate (:obj:`int`, `optional`):
-                The sampling rate at which the ``raw_speech`` input was sampled. It is strongly recommended to pass
-                ``sampling_rate`` at the forward call to prevent silent errors.
-            padding_value (:obj:`float`, defaults to 0.0):
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
+                `sampling_rate` at the forward call to prevent silent errors.
+            padding_value (`float`, defaults to 0.0):
         """
 
         if sampling_rate is not None:
diff --git a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
index 4c84019cb8..3c24e4b161 100644
--- a/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_flax_wav2vec2.py
@@ -937,25 +937,27 @@ class FlaxWav2Vec2Model(FlaxWav2Vec2PreTrainedModel):
 FLAX_WAV2VEC2_MODEL_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> from transformers import Wav2Vec2Processor, FlaxWav2Vec2Model
-        >>> from datasets import load_dataset
-        >>> import soundfile as sf
+    ```python
+    >>> from transformers import Wav2Vec2Processor, FlaxWav2Vec2Model
+    >>> from datasets import load_dataset
+    >>> import soundfile as sf
 
-        >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")
-        >>> model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-large-lv60")
+    >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-lv60")
+    >>> model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-large-lv60")
 
-        >>> def map_to_array(batch):
-        >>>     speech, _ = sf.read(batch["file"])
-        >>>     batch["speech"] = speech
-        >>>     return batch
+    >>> def map_to_array(batch):
+    ...     speech, _ = sf.read(batch["file"])
+    ...     batch["speech"] = speech
+    ...     return batch
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        >>> ds = ds.map(map_to_array)
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = ds.map(map_to_array)
 
-        >>> input_values = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="np").input_values  # Batch size 1
-        >>> hidden_states = model(input_values).last_hidden_state
+    >>> input_values = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="np").input_values  # Batch size 1
+    >>> hidden_states = model(input_values).last_hidden_state
+    ```
 """
 
 overwrite_call_docstring(
@@ -1037,30 +1039,32 @@ class FlaxWav2Vec2ForCTC(FlaxWav2Vec2PreTrainedModel):
 FLAX_WAV2VEC2_FOR_CTC_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> import jax.numpy as jnp
-        >>> from transformers import Wav2Vec2Processor, FlaxWav2Vec2ForCTC
-        >>> from datasets import load_dataset
-        >>> import soundfile as sf
+    ```python
+    >>> import jax.numpy as jnp
+    >>> from transformers import Wav2Vec2Processor, FlaxWav2Vec2ForCTC
+    >>> from datasets import load_dataset
+    >>> import soundfile as sf
 
-        >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60")
-        >>> model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60")
+    >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h-lv60")
+    >>> model = FlaxWav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h-lv60")
 
-        >>> def map_to_array(batch):
-        >>>     speech, _ = sf.read(batch["file"])
-        >>>     batch["speech"] = speech
-        >>>     return batch
+    >>> def map_to_array(batch):
+    ...     speech, _ = sf.read(batch["file"])
+    ...     batch["speech"] = speech
+    ...     return batch
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        >>> ds = ds.map(map_to_array)
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = ds.map(map_to_array)
 
-        >>> input_values = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="np").input_values  # Batch size 1
-        >>> logits = model(input_values).logits
-        >>> predicted_ids = jnp.argmax(logits, axis=-1)
+    >>> input_values = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="np").input_values  # Batch size 1
+    >>> logits = model(input_values).logits
+    >>> predicted_ids = jnp.argmax(logits, axis=-1)
 
-        >>> transcription = processor.decode(predicted_ids[0])
-        >>> # should give:  "A MAN SAID TO THE UNIVERSE SIR I EXIST"
+    >>> transcription = processor.decode(predicted_ids[0])
+    >>> # should give:  "A MAN SAID TO THE UNIVERSE SIR I EXIST"
+    ```
 """
 
 overwrite_call_docstring(
@@ -1108,10 +1112,11 @@ class FlaxWav2Vec2ForPreTrainingModule(nn.Module):
         r"""
         Returns:
 
-        Example::
+        Example:
 
+        ```python
 
-        """
+        ```"""
 
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
@@ -1220,45 +1225,47 @@ class FlaxWav2Vec2ForPreTraining(FlaxWav2Vec2PreTrainedModel):
 FLAX_WAV2VEC2_FOR_PRETRAINING_DOCSTRING = """
     Returns:
 
-    Example::
+    Example:
 
-        >>> import optax
-        >>> import numpy as np
-        >>> import jax.numpy as jnp
-        >>> from transformers import Wav2Vec2FeatureExtractor, FlaxWav2Vec2ForPreTraining
-        >>> from transformers.models.wav2vec2.modeling_flax_wav2vec2 import _compute_mask_indices
-        >>> from datasets import load_dataset
-        >>> import soundfile as sf
+    ```python
+    >>> import optax
+    >>> import numpy as np
+    >>> import jax.numpy as jnp
+    >>> from transformers import Wav2Vec2FeatureExtractor, FlaxWav2Vec2ForPreTraining
+    >>> from transformers.models.wav2vec2.modeling_flax_wav2vec2 import _compute_mask_indices
+    >>> from datasets import load_dataset
+    >>> import soundfile as sf
 
-        >>> feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-lv60")
-        >>> model = FlaxWav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-lv60")
+    >>> feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-large-lv60")
+    >>> model = FlaxWav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-large-lv60")
 
 
-        >>> def map_to_array(batch):
-        ...     speech, _ = sf.read(batch["file"])
-        ...     batch["speech"] = speech
-        ...     return batch
+    >>> def map_to_array(batch):
+    ...     speech, _ = sf.read(batch["file"])
+    ...     batch["speech"] = speech
+    ...     return batch
 
 
-        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-        >>> ds = ds.map(map_to_array)
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> ds = ds.map(map_to_array)
 
-        >>> input_values = feature_extractor(ds["speech"][0], return_tensors="np").input_values  # Batch size 1
+    >>> input_values = feature_extractor(ds["speech"][0], return_tensors="np").input_values  # Batch size 1
 
-        >>> # compute masked indices
-        >>> batch_size, raw_sequence_length = input_values.shape
-        >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
-        >>> mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2)
+    >>> # compute masked indices
+    >>> batch_size, raw_sequence_length = input_values.shape
+    >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length)
+    >>> mask_time_indices = _compute_mask_indices((batch_size, sequence_length), mask_prob=0.2, mask_length=2)
 
-        >>> outputs = model(input_values, mask_time_indices=mask_time_indices)
+    >>> outputs = model(input_values, mask_time_indices=mask_time_indices)
 
-        >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
-        >>> cosine_sim = optax.cosine_similarity(
-        ...     outputs.projected_states, outputs.projected_quantized_states
-        ... )
+    >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
+    >>> cosine_sim = optax.cosine_similarity(
+    ...     outputs.projected_states, outputs.projected_quantized_states
+    ... )
 
-        >>> # show that cosine similarity is much higher than random
-        >>> assert np.asarray(cosine_sim)[mask_time_indices].mean() > 0.5
+    >>> # show that cosine similarity is much higher than random
+    >>> assert np.asarray(cosine_sim)[mask_time_indices].mean() > 0.5
+    ```
 """
 
 overwrite_call_docstring(
diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
index af72317b62..f1de2f14a1 100644
--- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
@@ -1396,26 +1396,27 @@ class TFWav2Vec2Model(TFWav2Vec2PreTrainedModel):
 
         Returns:
 
-        Example::
+        Example:
 
-            >>> from transformers import Wav2Vec2Processor, TFWav2Vec2Model
-            >>> from datasets import load_dataset
-            >>> import soundfile as sf
+        ```python
+        >>> from transformers import Wav2Vec2Processor, TFWav2Vec2Model
+        >>> from datasets import load_dataset
+        >>> import soundfile as sf
 
-            >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
-            >>> model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
+        >>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
+        >>> model = TFWav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
 
-            >>> def map_to_array(batch):
-            >>>     speech, _ = sf.read(batch["file"])
-            >>>     batch["speech"] = speech
-            >>>     return batch
+        >>> def map_to_array(batch):
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...     return batch
 
-            >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
-            >>> ds = ds.map(map_to_array)
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = ds.map(map_to_array)
 
-            >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
-            >>> hidden_states = model(input_values).last_hidden_state
-        """
+        >>> input_values = processor(ds["speech"][0], return_tensors="tf").input_values  # Batch size 1
+        >>> hidden_states = model(input_values).last_hidden_state
+        ```"""
 
         inputs = input_values_processing(
             func=self.call,
diff --git a/src/transformers/models/wav2vec2/processing_wav2vec2.py b/src/transformers/models/wav2vec2/processing_wav2vec2.py
index 2f8dcb4ad0..3b1313665a 100644
--- a/src/transformers/models/wav2vec2/processing_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/processing_wav2vec2.py
@@ -29,16 +29,16 @@ class Wav2Vec2Processor:
     Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor and a Wav2Vec2 CTC tokenizer into a single
     processor.
 
-    :class:`~transformers.Wav2Vec2Processor` offers all the functionalities of
-    :class:`~transformers.Wav2Vec2FeatureExtractor` and :class:`~transformers.PreTrainedTokenizer`. See the docstring
-    of :meth:`~transformers.Wav2Vec2Processor.__call__` and :meth:`~transformers.Wav2Vec2Processor.decode` for more
+    [`Wav2Vec2Processor`] offers all the functionalities of
+    [`Wav2Vec2FeatureExtractor`] and [`PreTrainedTokenizer`]. See the docstring
+    of [`~Wav2Vec2Processor.__call__`] and [`~Wav2Vec2Processor.decode`] for more
     information.
 
     Args:
-        feature_extractor (:obj:`Wav2Vec2FeatureExtractor`):
-            An instance of :class:`~transformers.Wav2Vec2FeatureExtractor`. The feature extractor is a required input.
-        tokenizer (:class:`~transformers.PreTrainedTokenizer`):
-            An instance of :class:`~transformers.PreTrainedTokenizer`. The tokenizer is a required input.
+        feature_extractor (`Wav2Vec2FeatureExtractor`):
+            An instance of [`Wav2Vec2FeatureExtractor`]. The feature extractor is a required input.
+        tokenizer ([`PreTrainedTokenizer`]):
+            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
     """
 
     def __init__(self, feature_extractor, tokenizer):
@@ -57,18 +57,20 @@ class Wav2Vec2Processor:
 
     def save_pretrained(self, save_directory):
         """
-        Save a Wav2Vec2 feature_extractor object and Wav2Vec2 tokenizer object to the directory ``save_directory``, so
-        that it can be re-loaded using the :func:`~transformers.Wav2Vec2Processor.from_pretrained` class method.
+        Save a Wav2Vec2 feature_extractor object and Wav2Vec2 tokenizer object to the directory `save_directory`, so
+        that it can be re-loaded using the [`~Wav2Vec2Processor.from_pretrained`] class method.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling
-            :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained` and
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained`. Please refer to the
-            docstrings of the methods above for more information.
+        This class method is simply calling
+        [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and
+        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`]. Please refer to the
+        docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            save_directory (:obj:`str` or :obj:`os.PathLike`):
+            save_directory (`str` or `os.PathLike`):
                 Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                 be created if it does not exist).
         """
@@ -79,30 +81,32 @@ class Wav2Vec2Processor:
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         r"""
-        Instantiate a :class:`~transformers.Wav2Vec2Processor` from a pretrained Wav2Vec2 processor.
+        Instantiate a [`Wav2Vec2Processor`] from a pretrained Wav2Vec2 processor.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling Wav2Vec2FeatureExtractor's
-            :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained` and
-            PreTrainedTokenizer's :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`.
-            Please refer to the docstrings of the methods above for more information.
+        This class method is simply calling Wav2Vec2FeatureExtractor's
+        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and
+        PreTrainedTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`].
+        Please refer to the docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 This can be either:
 
-                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a feature extractor file saved using the
-                  :meth:`~transformers.SequenceFeatureExtractor.save_pretrained` method, e.g.,
-                  ``./my_model_directory/``.
-                - a path or url to a saved feature extractor JSON `file`, e.g.,
-                  ``./my_model_directory/preprocessor_config.json``.
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g.,
+                  `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
             **kwargs
-                Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
-                :class:`~transformers.PreTrainedTokenizer`
+                Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
+                [`PreTrainedTokenizer`]
         """
         feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(pretrained_model_name_or_path, **kwargs)
 
@@ -128,9 +132,9 @@ class Wav2Vec2Processor:
     def __call__(self, *args, **kwargs):
         """
         When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
-        :meth:`~transformers.Wav2Vec2FeatureExtractor.__call__` and returns its output. If used in the context
-        :meth:`~transformers.Wav2Vec2Processor.as_target_processor` this method forwards all its arguments to
-        PreTrainedTokenizer's :meth:`~transformers.PreTrainedTokenizer.__call__`. Please refer to the docstring of the
+        [`~Wav2Vec2FeatureExtractor.__call__`] and returns its output. If used in the context
+        [`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to
+        PreTrainedTokenizer's [`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of the
         above two methods for more information.
         """
         return self.current_processor(*args, **kwargs)
@@ -138,9 +142,9 @@ class Wav2Vec2Processor:
     def pad(self, *args, **kwargs):
         """
         When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
-        :meth:`~transformers.Wav2Vec2FeatureExtractor.pad` and returns its output. If used in the context
-        :meth:`~transformers.Wav2Vec2Processor.as_target_processor` this method forwards all its arguments to
-        PreTrainedTokenizer's :meth:`~transformers.PreTrainedTokenizer.pad`. Please refer to the docstring of the above
+        [`~Wav2Vec2FeatureExtractor.pad`] and returns its output. If used in the context
+        [`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to
+        PreTrainedTokenizer's [`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above
         two methods for more information.
         """
         return self.current_processor.pad(*args, **kwargs)
@@ -148,7 +152,7 @@ class Wav2Vec2Processor:
     def batch_decode(self, *args, **kwargs):
         """
         This method forwards all its arguments to PreTrainedTokenizer's
-        :meth:`~transformers.PreTrainedTokenizer.batch_decode`. Please refer to the docstring of this method for more
+        [`~PreTrainedTokenizer.batch_decode`]. Please refer to the docstring of this method for more
         information.
         """
         return self.tokenizer.batch_decode(*args, **kwargs)
@@ -156,7 +160,7 @@ class Wav2Vec2Processor:
     def decode(self, *args, **kwargs):
         """
         This method forwards all its arguments to PreTrainedTokenizer's
-        :meth:`~transformers.PreTrainedTokenizer.decode`. Please refer to the docstring of this method for more
+        [`~PreTrainedTokenizer.decode`]. Please refer to the docstring of this method for more
         information.
         """
         return self.tokenizer.decode(*args, **kwargs)
diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
index 0c8eb31d01..d2e024ed0b 100644
--- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py
@@ -50,31 +50,31 @@ PRETRAINED_VOCAB_FILES_MAP = {
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/wav2vec2-base-960h": sys.maxsize}
 
 WAV2VEC2_KWARGS_DOCSTRING = r"""
-            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
                 Activates and controls padding. Accepts the following values:
 
-                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                   single sequence if provided).
-                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                   maximum acceptable input length for the model if that argument is not provided.
-                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                   different lengths).
-            max_length (:obj:`int`, `optional`):
+            max_length (`int`, *optional*):
                 Controls the maximum length to use by one of the truncation/padding parameters.
 
-                If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum
                 length is required by one of the truncation/padding parameters. If the model has no specific maximum
                 input length (like XLNet) truncation/padding to a maximum length will be deactivated.
-            pad_to_multiple_of (:obj:`int`, `optional`):
+            pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
-            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            verbose (`bool`, *optional*, defaults to `True`):
                 Whether or not to print more information and warnings.
 """
 
@@ -84,28 +84,28 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
     """
     Constructs a Wav2Vec2CTC tokenizer.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains some of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods.
     Users should refer to the superclass for more information regarding such methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sentence token.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sentence token.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`"|"`):
+        word_delimiter_token (`str`, *optional*, defaults to `"|"`):
             The token used for defining the end of a word.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        do_lower_case (`bool`, *optional*, defaults to `False`):
             Whether or not to accept lowercase input and lowercase the output when decoding.
 
         **kwargs
-            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
+            Additional keyword arguments passed along to [`PreTrainedTokenizer`]
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -153,7 +153,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
     @property
     def word_delimiter_token(self) -> str:
         """
-        :obj:`str`: Word delimiter token. Log an error if used while not having been set.
+        `str`: Word delimiter token. Log an error if used while not having been set.
         """
         if self._word_delimiter_token is None and self.verbose:
             logger.error("Using word_delimiter_token, but it is not set yet.")
@@ -163,7 +163,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
     @property
     def word_delimiter_token_id(self) -> Optional[int]:
         """
-        :obj:`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns :obj:`None` if the token has
+        `Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns `None` if the token has
         not been set.
         """
         if self._word_delimiter_token is None:
@@ -285,26 +285,27 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer):
         it with indices starting from length of the current vocabulary.
 
         Args:
-            new_tokens (:obj:`List[str]`or :obj:`List[tokenizers.AddedToken]`):
+            new_tokens (`List[str]` or `List[tokenizers.AddedToken]`):
                 Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
-                checking if the tokenizer assign the index of the ``unk_token`` to them).
-            special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                checking if the tokenizer assigns the index of the `unk_token` to them).
+            special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the tokens should be added as special tokens.
 
         Returns:
-            :obj:`int`: The number of tokens actually added to the vocabulary.
+            `int`: The number of tokens actually added to the vocabulary.
 
-        Examples::
+        Examples:
 
-            # Let's see how to increase the vocabulary of Bert model and tokenizer
-            tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('facebook/wav2vec2-base-960h')
-            model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base-960h')
+        ```python
+        # Let's see how to increase the vocabulary of the Wav2Vec2 model and tokenizer
+        tokenizer = Wav2Vec2CTCTokenizer.from_pretrained('facebook/wav2vec2-base-960h')
+        model = Wav2Vec2ForCTC.from_pretrained('facebook/wav2vec2-base-960h')
 
-            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
-            print('We have added', num_added_toks, 'tokens')
-            # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
-            model.resize_token_embeddings(len(tokenizer))
-        """
+        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
+        print('We have added', num_added_toks, 'tokens')
+        # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
+        model.resize_token_embeddings(len(tokenizer))
+        ```"""
         new_tokens = [str(tok) for tok in new_tokens]
 
         tokens_to_add = []
@@ -341,45 +342,44 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
     """
     Constructs a Wav2Vec2 tokenizer.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains some of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods.
     Users should refer to the superclass for more information regarding such methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sentence token.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sentence token.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`"|"`):
+        word_delimiter_token (`str`, *optional*, defaults to `"|"`):
             The token used for defining the end of a word.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        do_lower_case (`bool`, *optional*, defaults to `False`):
             Whether or not to lowercase the output when decoding.
-        do_normalize (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        do_normalize (`bool`, *optional*, defaults to `False`):
             Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly
-            improve the performance for some models, *e.g.*, `wav2vec2-lv60
-            <https://huggingface.co/models?search=lv60>`__.
-        return_attention_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not :meth:`~transformers.Wav2Vec2Tokenizer.__call__` should return :obj:`attention_mask`.
+            improve the performance for some models, *e.g.*, [wav2vec2-lv60](https://huggingface.co/models?search=lv60).
+        return_attention_mask (`bool`, *optional*, defaults to `False`):
+            Whether or not [`~Wav2Vec2Tokenizer.__call__`] should return `attention_mask`.
 
-            .. note::
+            <Tip>
 
-                Wav2Vec2 models that have set ``config.feat_extract_norm == "group"``, such as `wav2vec2-base
-                <https://huggingface.co/facebook/wav2vec2-base-960h>`__, have **not** been trained using
-                :obj:`attention_mask`. For such models, :obj:`input_values` should simply be padded with 0 and no
-                :obj:`attention_mask` should be passed.
+            Wav2Vec2 models that have set `config.feat_extract_norm == "group"`, such as [wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base-960h), have **not** been trained using
+            `attention_mask`. For such models, `input_values` should simply be padded with 0 and no
+            `attention_mask` should be passed.
 
-                For Wav2Vec2 models that have set ``config.feat_extract_norm == "layer"``, such as `wav2vec2-lv60
-                <https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self>`__, :obj:`attention_mask` should be
-                passed for batched inference.
+            For Wav2Vec2 models that have set `config.feat_extract_norm == "layer"`, such as [wav2vec2-lv60](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), `attention_mask` should be
+            passed for batched inference.
+
+            </Tip>
 
         **kwargs
-            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
+            Additional keyword arguments passed along to [`PreTrainedTokenizer`]
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -437,7 +437,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
     @property
     def word_delimiter_token(self) -> str:
         """
-        :obj:`str`: Padding token. Log an error if used while not having been set.
+        `str`: Word delimiter token. Log an error if used while not having been set.
         """
         if self._word_delimiter_token is None and self.verbose:
             logger.error("Using word_delimiter_token, but it is not set yet.")
@@ -447,7 +447,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
     @property
     def word_delimiter_token_id(self) -> Optional[int]:
         """
-        :obj:`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns :obj:`None` if the token has
+        `Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns `None` if the token has
         not been set.
         """
         if self._word_delimiter_token is None:
@@ -478,7 +478,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer):
         sequences.
 
         Args:
-            raw_speech (:obj:`np.ndarray`, :obj:`List[float]`, :obj:`List[np.ndarray]`, :obj:`List[List[float]]`):
+            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
                 The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
                 values, a list of numpy arrayr or a list of list of float values.
         """
diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
index a11134b97d..0fc8cb7a49 100644
--- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
+++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py
@@ -52,32 +52,32 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
     """
     Constructs a Wav2Vec2PhonemeCTC tokenizer.
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains some of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods.
     Users should refer to the superclass for more information regarding such methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             File containing the vocabulary.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sentence token.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sentence token.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        do_phonemize (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_phonemize (`bool`, *optional*, defaults to `True`):
             Whether the tokenizer should phonetize the input or not. Only if a sequence of phonemes is passed to the
-            tokenizer, :obj:`do_phonemize` should be set to ``False``.
-        phonemizer_lang (:obj:`str`, `optional`, defaults to :obj:`"en-us"`):
+            tokenizer, `do_phonemize` should be set to `False`.
+        phonemizer_lang (`str`, *optional*, defaults to `"en-us"`):
             The language of the phoneme set to which the tokenizer should phonetize the input text to.
-        phonemizer_backend (:obj:`str`, `optional`. defaults to :obj:`"espeak"`):
-            The backend phonetization library that shall be used by the phonemizer library. Defaults to ``espeak-ng``.
-            See the `phonemizer package <https://github.com/bootphon/phonemizer#readme>`_. for more information.
+        phonemizer_backend (`str`, *optional*, defaults to `"espeak"`):
+            The backend phonetization library that shall be used by the phonemizer library. Defaults to `espeak-ng`.
+            See the [phonemizer package](https://github.com/bootphon/phonemizer#readme) for more information.
 
         **kwargs
-            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
+            Additional keyword arguments passed along to [`PreTrainedTokenizer`]
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -139,25 +139,25 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
         """
         Performs any necessary transformations before tokenization.
 
-        This method should pop the arguments from kwargs and return the remaining :obj:`kwargs` as well. We test the
-        :obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used.
+        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
+        `kwargs` at the end of the encoding process to be sure all the arguments have been used.
 
         Args:
-            text (:obj:`str`):
+            text (`str`):
                 The text to prepare.
-            is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                 tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                 which it will tokenize. This is useful for NER or token classification.
-            phonemizer_lang (:obj:`str`, `optional`):
+            phonemizer_lang (`str`, *optional*):
                 The language of the phoneme set to which the tokenizer should phonetize the input text to.
-            do_phonemize (:obj:`bool`, `optional`):
+            do_phonemize (`bool`, *optional*):
                 Whether the tokenizer should phonetize the input text or not. Only if a sequence of phonemes is passed
-                to the tokenizer, :obj:`do_phonemize` should be set to ``False``.
+                to the tokenizer, `do_phonemize` should be set to `False`.
 
 
         Returns:
-            :obj:`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
+            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
         """
         if is_split_into_words:
             text = " " + text
@@ -217,7 +217,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
     @property
     def word_delimiter_token(self) -> str:
         """
-        :obj:`str`: Word delimiter token. Log an error if used while not having been set.
+        `str`: Word delimiter token. Log an error if used while not having been set.
         """
         if self._word_delimiter_token is None and self.verbose:
             return None
@@ -226,7 +226,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
     @property
     def word_delimiter_token_id(self) -> Optional[int]:
         """
-        :obj:`Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns :obj:`None` if the token has
+        `Optional[int]`: Id of the word_delimiter_token in the vocabulary. Returns `None` if the token has
         not been set.
         """
         if self._word_delimiter_token is None:
@@ -244,7 +244,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
     @property
     def phone_delimiter_token(self) -> str:
         """
-        :obj:`str`: Word delimiter token. Log an error if used while not having been set.
+        `str`: Phone delimiter token. Log an error if used while not having been set.
         """
         if self._phone_delimiter_token is None and self.verbose:
             logger.error("Using phone_delimiter_token, but it is not set yet.")
@@ -254,7 +254,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
     @property
     def phone_delimiter_token_id(self) -> Optional[int]:
         """
-        :obj:`Optional[int]`: Id of the phone_delimiter_token in the vocabulary. Returns :obj:`None` if the token has
+        `Optional[int]`: Id of the phone_delimiter_token in the vocabulary. Returns `None` if the token has
         not been set.
         """
         if self._phone_delimiter_token is None:
@@ -357,26 +357,27 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer):
         it with indices starting from length of the current vocabulary.
 
         Args:
-            new_tokens (:obj:`List[str]`or :obj:`List[tokenizers.AddedToken]`):
+            new_tokens (`List[str]` or `List[tokenizers.AddedToken]`):
                 Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
-                checking if the tokenizer assign the index of the ``unk_token`` to them).
-            special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                checking if the tokenizer assigns the index of the `unk_token` to them).
+            special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the tokens should be added as special tokens.
 
         Returns:
-            :obj:`int`: The number of tokens actually added to the vocabulary.
+            `int`: The number of tokens actually added to the vocabulary.
 
-        Examples::
+        Examples:
 
-            # Let's see how to increase the vocabulary of Bert model and tokenizer
-            tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained('facebook/wav2vec2-lv-60-espeak-cv-ft')
-            model = Wav2Vec2PhonemeForCTC.from_pretrained('facebook/wav2vec2-lv-60-espeak-cv-ft')
+        ```python
+        # Let's see how to increase the vocabulary of the Wav2Vec2Phoneme model and tokenizer
+        tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained('facebook/wav2vec2-lv-60-espeak-cv-ft')
+        model = Wav2Vec2PhonemeForCTC.from_pretrained('facebook/wav2vec2-lv-60-espeak-cv-ft')
 
-            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
-            print('We have added', num_added_toks, 'tokens')
-            # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
-            model.resize_token_embeddings(len(tokenizer))
-        """
+        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
+        print('We have added', num_added_toks, 'tokens')
+        # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
+        model.resize_token_embeddings(len(tokenizer))
+        ```"""
         new_tokens = [str(tok) for tok in new_tokens]
 
         tokens_to_add = []
diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
index 291898274d..be0108173d 100644
--- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
+++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
@@ -37,10 +37,10 @@ if TYPE_CHECKING:
 @dataclass
 class Wav2Vec2DecoderWithLMOutput(ModelOutput):
     """
-    Output type of :class:`~transformers.Wav2Vec2DecoderWithLM`, with transcription.
+    Output type of [`Wav2Vec2DecoderWithLM`], with transcription.
 
     Args:
-        text (list of :obj:`str`):
+        text (list of `str`):
             Decoded logits in text from. Usually the speech transcription.
     """
 
@@ -53,12 +53,12 @@ class Wav2Vec2ProcessorWithLM:
     with language model support into a single processor for language model boosted speech recognition decoding.
 
     Args:
-        feature_extractor (:class:`~transformers.Wav2Vec2FeatureExtractor`):
-            An instance of :class:`~transformers.Wav2Vec2FeatureExtractor`. The feature extractor is a required input.
-        tokenizer (:class:`~transformers.Wav2Vec2CTCTokenizer`):
-            An instance of :class:`~transformers.Wav2Vec2CTCTokenizer`. The tokenizer is a required input.
-        decoder (:obj:`pyctcdecode.BeamSearchDecoderCTC`):
-            An instance of :class:`pyctcdecode.BeamSearchDecoderCTC`. The decoder is a required input.
+        feature_extractor ([`Wav2Vec2FeatureExtractor`]):
+            An instance of [`Wav2Vec2FeatureExtractor`]. The feature extractor is a required input.
+        tokenizer ([`Wav2Vec2CTCTokenizer`]):
+            An instance of [`Wav2Vec2CTCTokenizer`]. The tokenizer is a required input.
+        decoder (`pyctcdecode.BeamSearchDecoderCTC`):
+            An instance of [`pyctcdecode.BeamSearchDecoderCTC`]. The decoder is a required input.
     """
 
     def __init__(
@@ -98,20 +98,22 @@ class Wav2Vec2ProcessorWithLM:
     def save_pretrained(self, save_directory):
         """
         Save the Wav2Vec2 feature_extractor, a tokenizer object and a pyctcdecode decoder to the directory
-        ``save_directory``, so that they can be re-loaded using the
-        :func:`~transformers.Wav2Vec2ProcessorWithLM.from_pretrained` class method.
+        `save_directory`, so that they can be re-loaded using the
+        [`~Wav2Vec2ProcessorWithLM.from_pretrained`] class method.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling
-            :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.save_pretrained,`
-            :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.save_pretrained` and pyctcdecode's
-            :meth:`pyctcdecode.BeamSearchDecoderCTC.save_to_dir`.
+        This class method is simply calling
+        [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`],
+        [`~tokenization_utils_base.PreTrainedTokenizer.save_pretrained`] and pyctcdecode's
+        [`pyctcdecode.BeamSearchDecoderCTC.save_to_dir`].
 
-            Please refer to the docstrings of the methods above for more information.
+        Please refer to the docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            save_directory (:obj:`str` or :obj:`os.PathLike`):
+            save_directory (`str` or `os.PathLike`):
                 Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will
                 be created if it does not exist).
         """
@@ -122,32 +124,34 @@ class Wav2Vec2ProcessorWithLM:
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         r"""
-        Instantiate a :class:`~transformers.Wav2Vec2ProcessorWithLM` from a pretrained Wav2Vec2 processor.
+        Instantiate a [`Wav2Vec2ProcessorWithLM`] from a pretrained Wav2Vec2 processor.
 
-        .. note::
+        <Tip>
 
-            This class method is simply calling Wav2Vec2FeatureExtractor's
-            :meth:`~transformers.feature_extraction_utils.FeatureExtractionMixin.from_pretrained`,
-            Wav2Vec2CTCTokenizer's :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained`,
-            and :meth:`pyctcdecode.BeamSearchDecoderCTC.load_from_hf_hub`.
+        This class method is simply calling Wav2Vec2FeatureExtractor's
+        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`],
+        Wav2Vec2CTCTokenizer's [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`],
+        and [`pyctcdecode.BeamSearchDecoderCTC.load_from_hf_hub`].
 
-            Please refer to the docstrings of the methods above for more information.
+        Please refer to the docstrings of the methods above for more information.
+
+        </Tip>
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 This can be either:
 
-                - a string, the `model id` of a pretrained feature_extractor hosted inside a model repo on
-                  huggingface.co. Valid model ids can be located at the root-level, like ``bert-base-uncased``, or
-                  namespaced under a user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - a path to a `directory` containing a feature extractor file saved using the
-                  :meth:`~transformers.SequenceFeatureExtractor.save_pretrained` method, e.g.,
-                  ``./my_model_directory/``.
-                - a path or url to a saved feature extractor JSON `file`, e.g.,
-                  ``./my_model_directory/preprocessor_config.json``.
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or
+                  namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g.,
+                  `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
             **kwargs
-                Additional keyword arguments passed along to both :class:`~transformers.SequenceFeatureExtractor` and
-                :class:`~transformers.PreTrainedTokenizer`
+                Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
+                [`PreTrainedTokenizer`]
         """
         requires_backends(cls, "pyctcdecode")
         from pyctcdecode import BeamSearchDecoderCTC
@@ -215,9 +219,9 @@ class Wav2Vec2ProcessorWithLM:
     def __call__(self, *args, **kwargs):
         """
         When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
-        :meth:`~transformers.Wav2Vec2FeatureExtractor.__call__` and returns its output. If used in the context
-        :meth:`~transformers.Wav2Vec2ProcessorWithLM.as_target_processor` this method forwards all its arguments to
-        Wav2Vec2CTCTokenizer's :meth:`~transformers.Wav2Vec2CTCTokenizer.__call__`. Please refer to the docstring of
+        [`~Wav2Vec2FeatureExtractor.__call__`] and returns its output. If used in the context
+        [`~Wav2Vec2ProcessorWithLM.as_target_processor`] this method forwards all its arguments to
+        Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.__call__`]. Please refer to the docstring of
         the above two methods for more information.
         """
         return self.current_processor(*args, **kwargs)
@@ -225,9 +229,9 @@ class Wav2Vec2ProcessorWithLM:
     def pad(self, *args, **kwargs):
         """
         When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's
-        :meth:`~transformers.Wav2Vec2FeatureExtractor.pad` and returns its output. If used in the context
-        :meth:`~transformers.Wav2Vec2ProcessorWithLM.as_target_processor` this method forwards all its arguments to
-        Wav2Vec2CTCTokenizer's :meth:`~transformers.Wav2Vec2CTCTokenizer.pad`. Please refer to the docstring of the
+        [`~Wav2Vec2FeatureExtractor.pad`] and returns its output. If used in the context
+        [`~Wav2Vec2ProcessorWithLM.as_target_processor`] this method forwards all its arguments to
+        Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.pad`]. Please refer to the docstring of the
         above two methods for more information.
         """
         return self.current_processor.pad(*args, **kwargs)
@@ -245,30 +249,32 @@ class Wav2Vec2ProcessorWithLM:
         """
         Batch decode output logits to audio transcription with language model support.
 
-        .. note::
+        <Tip>
 
-            This function makes use of Python's multiprocessing.
+        This function makes use of Python's multiprocessing.
+
+        </Tip>
 
         Args:
-            logits (:obj:`np.ndarray`):
+            logits (`np.ndarray`):
                 The logits output vector of the model representing the log probabilities for each token.
-            num_processes (:obj:`int`, `optional`):
+            num_processes (`int`, *optional*):
                 Number of processes on which the function should be parallelized over. Defaults to the number of
                 available CPUs.
-            beam_width (:obj:`int`, `optional`):
+            beam_width (`int`, *optional*):
                 Maximum number of beams at each step in decoding. Defaults to pyctcdecode's DEFAULT_BEAM_WIDTH.
-            beam_prune_logp (:obj:`int`, `optional`):
+            beam_prune_logp (`int`, *optional*):
                 Beams that are much worse than best beam will be pruned Defaults to pyctcdecode's DEFAULT_PRUNE_LOGP.
-            token_min_logp (:obj:`int`, `optional`):
+            token_min_logp (`int`, *optional*):
                 Tokens below this logp are skipped unless they are argmax of frame Defaults to pyctcdecode's
                 DEFAULT_MIN_TOKEN_LOGP.
-            hotwords (:obj:`List[str]`, `optional`):
+            hotwords (`List[str]`, *optional*):
                 List of words with extra importance, can be OOV for LM
-            hotword_weight (:obj:`int`, `optional`):
+            hotword_weight (`int`, *optional*):
                 Weight factor for hotword importance Defaults to pyctcdecode's DEFAULT_HOTWORD_WEIGHT.
 
         Returns:
-            :class:`~transformers.models.wav2vec2.Wav2Vec2DecoderWithLMOutput` or :obj:`tuple`.
+            [`~models.wav2vec2.Wav2Vec2DecoderWithLMOutput`] or `tuple`.
 
         """
         from pyctcdecode.constants import (
@@ -318,23 +324,23 @@ class Wav2Vec2ProcessorWithLM:
         Decode output logits to audio transcription with language model support.
 
         Args:
-            logits (:obj:`np.ndarray`):
+            logits (`np.ndarray`):
                 The logits output vector of the model representing the log probabilities for each token.
-            beam_width (:obj:`int`, `optional`):
+            beam_width (`int`, *optional*):
                 Maximum number of beams at each step in decoding. Defaults to pyctcdecode's DEFAULT_BEAM_WIDTH.
-            beam_prune_logp (:obj:`int`, `optional`):
+            beam_prune_logp (`int`, *optional*):
                 A threshold to prune beams with log-probs less than best_beam_logp + beam_prune_logp. The value should
                 be <= 0. Defaults to pyctcdecode's DEFAULT_PRUNE_LOGP.
-            token_min_logp (:obj:`int`, `optional`):
+            token_min_logp (`int`, *optional*):
                 Tokens with log-probs below token_min_logp are skipped unless they are have the maximum log-prob for an
                 utterance. Defaults to pyctcdecode's DEFAULT_MIN_TOKEN_LOGP.
-            hotwords (:obj:`List[str]`, `optional`):
+            hotwords (`List[str]`, *optional*):
                 List of words with extra importance which can be missing from the LM's vocabulary, e.g. ["huggingface"]
-            hotword_weight (:obj:`int`, `optional`):
+            hotword_weight (`int`, *optional*):
                 Weight multiplier that boosts hotword scores. Defaults to pyctcdecode's DEFAULT_HOTWORD_WEIGHT.
 
         Returns:
-            :class:`~transformers.models.wav2vec2.Wav2Vec2DecoderWithLMOutput` or :obj:`tuple`.
+            [`~models.wav2vec2.Wav2Vec2DecoderWithLMOutput`] or `tuple`.
 
         """
         from pyctcdecode.constants import (
diff --git a/src/transformers/models/wavlm/configuration_wavlm.py b/src/transformers/models/wavlm/configuration_wavlm.py
index 42de770543..c6a848506d 100644
--- a/src/transformers/models/wavlm/configuration_wavlm.py
+++ b/src/transformers/models/wavlm/configuration_wavlm.py
@@ -28,162 +28,162 @@ WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class WavLMConfig(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.WavLMModel`. It is used to
+    This is the configuration class to store the configuration of a [`WavLMModel`]. It is used to
     instantiate an WavLM model according to the specified arguments, defining the model architecture. Instantiating a
-    configuration with the defaults will yield a similar configuration to that of the WavLM `facebook/wavlm-base-960h
-    <https://huggingface.co/facebook/wavlm-base-960h>`__ architecture.
+    configuration with the defaults will yield a similar configuration to that of the WavLM [facebook/wavlm-base-960h](https://huggingface.co/facebook/wavlm-base-960h) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 32):
+        vocab_size (`int`, *optional*, defaults to 32):
             Vocabulary size of the WavLM model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.WavLMModel`. Vocabulary size of the model.
-            Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of
-            :class:`~transformers.WavLMModel`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `inputs_ids` passed when calling [`WavLMModel`]. Vocabulary size of the model.
+            Defines the different tokens that can be represented by the *inputs_ids* passed to the forward method of
+            [`WavLMModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        attention_dropout (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        final_dropout (:obj:`float`, `optional`, defaults to 0.1):
-            The dropout probability for the final projection layer of :class:`WavLMForCTC`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`WavLMForCTC`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        feat_extract_norm (:obj:`str`, `optional`, defaults to :obj:`"group"`):
-            The norm to be applied to 1D convolutional layers in feature extractor. One of :obj:`"group"` for group
-            normalization of only the first 1D convolutional layer or :obj:`"layer"` for layer normalization of all 1D
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature extractor. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
             convolutional layers.
-        feat_proj_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probability for output of the feature extractor.
-        feat_extract_activation (:obj:`str, `optional`, defaults to :obj:`"gelu"`):
+        feat_extract_activation (`str`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the 1D convolutional layers of the feature
-            extractor. If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        feat_quantizer_dropout (obj:`float`, `optional`, defaults to 0.0):
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for quantized feature extractor states.
-        conv_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 512, 512, 512)`):
+        conv_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
             A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
-            feature extractor. The length of `conv_dim` defines the number of 1D convolutional layers.
-        conv_stride (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 2, 2, 2, 2, 2, 2)`):
+            feature extractor. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`Tuple[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
             A tuple of integers defining the stride of each 1D convolutional layer in the feature extractor. The length
-            of `conv_stride` defines the number of convolutional layers and has to match the the length of `conv_dim`.
-        conv_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(10, 3, 3, 3, 3, 3, 3)`):
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`Tuple[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
             A tuple of integers defining the kernel size of each 1D convolutional layer in the feature extractor. The
-            length of `conv_kernel` defines the number of convolutional layers and has to match the the length of
-            `conv_dim`.
-        conv_bias (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
             Whether the 1D convolutional layers have a bias.
-        num_conv_pos_embeddings (:obj:`int`, `optional`, defaults to 128):
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
             Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
             embeddings layer.
-        num_conv_pos_embedding_groups (:obj:`int`, `optional`, defaults to 16):
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
             Number of groups of 1D convolutional positional embeddings layer.
-        do_stable_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to apply `stable` layer norm architecture of the Transformer encoder. ``do_stable_layer_norm is
-            True`` corresponds to applying layer norm before the attention layer, whereas ``do_stable_layer_norm is
-            False`` corresponds to applying layer norm after the attention layer.
-        apply_spec_augment (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is False` corresponds to applying layer norm after the attention layer.
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
             Whether to apply *SpecAugment* data augmentation to the outputs of the feature extractor. For reference see
-            `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
-            <https://arxiv.org/abs/1904.08779>`__.
-        mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition](https://arxiv.org/abs/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
             Propability of each feature vector along the time axis to be chosen as the start of the vector span to be
-            masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be
-            masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
-        mask_time_length (:obj:`int`, `optional`, defaults to 10):
+            masked. Approximately `mask_time_prob * sequence_length // mask_time_length` feature vectors will be
+            masked along the time axis. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
             Length of vector span along the time axis.
-        mask_time_min_masks (:obj:`int`, `optional`, defaults to 2),:
-            The minimum number of masks of length ``mask_feature_length`` generated along the time axis, each time
-            step, irrespectively of ``mask_feature_prob``. Only relevant if
+        mask_time_min_masks (`int`, *optional*, defaults to 2):
+            The minimum number of masks of length `mask_feature_length` generated along the time axis, each time
+            step, irrespectively of `mask_feature_prob`. Only relevant if
             ''mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks''
-        mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
             Propability of each feature vector along the feature axis to be chosen as the start of the vector span to
-            be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be
-            masked along the time axis. This is only relevant if ``apply_spec_augment is True``.
-        mask_feature_length (:obj:`int`, `optional`, defaults to 10):
+            be masked. Approximately `mask_time_prob * hidden_size // mask_time_length` feature vectors will be
+            masked along the time axis. This is only relevant if `apply_spec_augment is True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
             Length of vector span along the feature axis.
-        num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
+        num_codevectors_per_group (`int`, *optional*, defaults to 320):
             Number of entries in each quantization codebook (group).
-        num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
+        num_codevector_groups (`int`, *optional*, defaults to 2):
             Number of codevector groups for product codevector quantization.
-        contrastive_logits_temperature (:obj:`float`, `optional`, defaults to 0.1):
-            The temperature `kappa` in the contrastive loss.
-        feat_quantizer_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
+            The temperature *kappa* in the contrastive loss.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
             The dropout probabilitiy for the output of the feature extractor that's used by the quantizer.
-        num_negatives (:obj:`int`, `optional`, defaults to 100):
+        num_negatives (`int`, *optional*, defaults to 100):
             Number of negative samples for the contrastive loss.
-        codevector_dim (:obj:`int`, `optional`, defaults to 256):
+        codevector_dim (`int`, *optional*, defaults to 256):
             Dimensionality of the quantized feature vectors.
-        proj_codevector_dim (:obj:`int`, `optional`, defaults to 256):
+        proj_codevector_dim (`int`, *optional*, defaults to 256):
             Dimensionality of the final projection of both the quantized and the transformer features.
-        diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1):
+        diversity_loss_weight (`float`, *optional*, defaults to 0.1):
             The weight of the codebook diversity loss component.
-        ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"mean"`):
-            Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
-            instance of :class:`~transformers.WavLMForCTC`.
-        ctc_zero_infinity (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to zero infinite losses and the associated gradients of ``torch.nn.CTCLoss``. Infinite losses
+        ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`WavLMForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses
             mainly occur when the inputs are too short to be aligned to the targets. Only relevant when training an
-            instance of :class:`~transformers.WavLMForCTC`.
-        use_weighted_layer_sum (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            instance of [`WavLMForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
             Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
-            instance of :class:`~transformers.WavLMForSequenceClassification`.
-        classifier_proj_size (:obj:`int`, `optional`, defaults to 256):
+            instance of [`WavLMForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 256):
             Dimensionality of the projection before token mean-pooling for classification.
-        tdnn_dim (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(512, 512, 512, 512, 1500)`):
-            A tuple of integers defining the number of output channels of each 1D convolutional layer in the `TDNN`
-            module of the `XVector` model. The length of `tdnn_dim` defines the number of `TDNN` layers.
-        tdnn_kernel (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(5, 3, 3, 1, 1)`):
-            A tuple of integers defining the kernel size of each 1D convolutional layer in the `TDNN` module of the
-            `XVector` model. The length of `tdnn_kernel` has to match the length of `tdnn_dim`.
-        tdnn_dilation (:obj:`Tuple[int]`, `optional`, defaults to :obj:`(1, 2, 3, 1, 1)`):
-            A tuple of integers defining the dilation factor of each 1D convolutional layer in `TDNN` module of the
-            `XVector` model. The length of `tdnn_dilation` has to match the length of `tdnn_dim`.
-        xvector_output_dim (:obj:`int`, `optional`, defaults to 512):
-            Dimensionality of the `XVector` embedding vectors.
-        add_adapter (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        tdnn_dim (`Tuple[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+            A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
+            module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
+        tdnn_kernel (`Tuple[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
+            *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
+        tdnn_dilation (`Tuple[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+            A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
+            *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
+        xvector_output_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of the *XVector* embedding vectors.
+        add_adapter (`bool`, *optional*, defaults to `False`):
             Whether a convolutional network should be stacked on top of the Wav2Vec2 Encoder. Can be very useful for
             warm-starting Wav2Vec2 for SpeechEncoderDecoder models.
-        adapter_kernel_size (:obj:`int`, `optional`, defaults to 3):
-            Kernel size of the convolutional layers in the adapter network. Only relevant if ``add_adapter is True``.
-        adapter_stride (:obj:`int`, `optional`, defaults to 2):
-            Stride of the convolutional layers in the adapter network. Only relevant if ``add_adapter is True``.
-        num_adapter_layers (:obj:`int`, `optional`, defaults to 3):
-            Number of convolutional layers that should be used in the adapter network. Only relevant if ``add_adapter
-            is True``.
-        output_hidden_size (:obj:`int`, `optional`):
-            Dimensionality of the encoder output layer. If not defined, this defaults to `hidden-size`. Only relevant
-            if ``add_adapter is True``.
+        adapter_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        adapter_stride (`int`, *optional*, defaults to 2):
+            Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        num_adapter_layers (`int`, *optional*, defaults to 3):
+            Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is True`.
+        output_hidden_size (`int`, *optional*):
+            Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant
+            if `add_adapter is True`.
 
-    Example::
+    Example:
 
-    Example::
+    ```python
 
-        >>> from transformers import WavLMModel, WavLMConfig
+    ```
 
-        >>> # Initializing a WavLM facebook/wavlm-base-960h style configuration
-        >>> configuration = WavLMConfig()
+    Example:
 
-        >>> # Initializing a model from the facebook/wavlm-base-960h style configuration
-        >>> model = WavLMModel(configuration)
+    ```python
+    >>> from transformers import WavLMModel, WavLMConfig
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Initializing a WavLM facebook/wavlm-base-960h style configuration
+    >>> configuration = WavLMConfig()
+
+    >>> # Initializing a model from the facebook/wavlm-base-960h style configuration
+    >>> model = WavLMModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
     model_type = "wavlm"
 
     def __init__(
diff --git a/src/transformers/models/xlm/configuration_xlm.py b/src/transformers/models/xlm/configuration_xlm.py
index 858bea96d3..ba6dd8dfa0 100644
--- a/src/transformers/models/xlm/configuration_xlm.py
+++ b/src/transformers/models/xlm/configuration_xlm.py
@@ -36,114 +36,115 @@ XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class XLMConfig(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a :class:`~transformers.XLMModel` or a
-    :class:`~transformers.TFXLMModel`. It is used to instantiate a XLM model according to the specified arguments,
+    This is the configuration class to store the configuration of a [`XLMModel`] or a
+    [`TFXLMModel`]. It is used to instantiate a XLM model according to the specified arguments,
     defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
-    to that of the `xlm-mlm-en-2048 <https://huggingface.co/xlm-mlm-en-2048>`__ architecture.
+    to that of the [xlm-mlm-en-2048](https://huggingface.co/xlm-mlm-en-2048) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 30145):
+        vocab_size (`int`, *optional*, defaults to 30145):
             Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.XLMModel` or :class:`~transformers.TFXLMModel`.
-        emb_dim (:obj:`int`, `optional`, defaults to 2048):
+            `inputs_ids` passed when calling [`XLMModel`] or [`TFXLMModel`].
+        emb_dim (`int`, *optional*, defaults to 2048):
             Dimensionality of the encoder layers and the pooler layer.
-        n_layer (:obj:`int`, `optional`, defaults to 12):
+        n_layer (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        n_head (:obj:`int`, `optional`, defaults to 16):
+        n_head (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.1):
+        attention_dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for the attention mechanism
-        gelu_activation (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to use `gelu` for the activations instead of `relu`.
-        sinusoidal_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        gelu_activation (`bool`, *optional*, defaults to `True`):
+            Whether or not to use *gelu* for the activations instead of *relu*.
+        sinusoidal_embeddings (`bool`, *optional*, defaults to `False`):
             Whether or not to use sinusoidal positional embeddings instead of absolute positional embeddings.
-        causal (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        causal (`bool`, *optional*, defaults to `False`):
             Whether or not the model should behave in a causal manner. Causal models use a triangular attention mask in
             order to only attend to the left-side context instead if a bidirectional context.
-        asm (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        asm (`bool`, *optional*, defaults to `False`):
             Whether or not to use an adaptive log softmax projection layer instead of a linear layer for the prediction
             layer.
-        n_langs (:obj:`int`, `optional`, defaults to 1):
+        n_langs (`int`, *optional*, defaults to 1):
             The number of languages the model handles. Set to 1 for monolingual models.
-        use_lang_emb (:obj:`bool`, `optional`, defaults to :obj:`True`)
-            Whether to use language embeddings. Some models use additional language embeddings, see `the multilingual
-            models page <http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings>`__ for
+        use_lang_emb (`bool`, *optional*, defaults to `True`):
+            Whether to use language embeddings. Some models use additional language embeddings, see [the multilingual
+            models page](http://huggingface.co/transformers/multilingual.html#xlm-language-embeddings) for
             information on how to use them.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        embed_init_std (:obj:`float`, `optional`, defaults to 2048^-0.5):
+        embed_init_std (`float`, *optional*, defaults to 2048^-0.5):
             The standard deviation of the truncated_normal_initializer for initializing the embedding matrices.
-        init_std (:obj:`int`, `optional`, defaults to 50257):
+        init_std (`int`, *optional*, defaults to 50257):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices except the
             embedding matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        bos_index (:obj:`int`, `optional`, defaults to 0):
+        bos_index (`int`, *optional*, defaults to 0):
             The index of the beginning of sentence token in the vocabulary.
-        eos_index (:obj:`int`, `optional`, defaults to 1):
+        eos_index (`int`, *optional*, defaults to 1):
             The index of the end of sentence token in the vocabulary.
-        pad_index (:obj:`int`, `optional`, defaults to 2):
+        pad_index (`int`, *optional*, defaults to 2):
             The index of the padding token in the vocabulary.
-        unk_index (:obj:`int`, `optional`, defaults to 3):
+        unk_index (`int`, *optional*, defaults to 3):
             The index of the unknown token in the vocabulary.
-        mask_index (:obj:`int`, `optional`, defaults to 5):
+        mask_index (`int`, *optional*, defaults to 5):
             The index of the masking token in the vocabulary.
-        is_encoder(:obj:`bool`, `optional`, defaults to :obj:`True`):
+        is_encoder (`bool`, *optional*, defaults to `True`):
             Whether or not the initialized model should be a transformer encoder or decoder as seen in Vaswani et al.
-        summary_type (:obj:`string`, `optional`, defaults to "first"):
+        summary_type (`str`, *optional*, defaults to `"first"`):
             Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
 
             Has to be one of the following options:
 
-                - :obj:`"last"`: Take the last token hidden state (like XLNet).
-                - :obj:`"first"`: Take the first token hidden state (like BERT).
-                - :obj:`"mean"`: Take the mean of all tokens hidden states.
-                - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
-                - :obj:`"attn"`: Not implemented now, use multi-head attention.
-        summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                - `"last"`: Take the last token hidden state (like XLNet).
+                - `"first"`: Take the first token hidden state (like BERT).
+                - `"mean"`: Take the mean of all tokens hidden states.
+                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+                - `"attn"`: Not implemented now, use multi-head attention.
+        summary_use_proj (`bool`, *optional*, defaults to `True`):
             Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
 
             Whether or not to add a projection after the vector extraction.
-        summary_activation (:obj:`str`, `optional`):
+        summary_activation (`str`, *optional*):
             Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
 
-            Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation.
-        summary_proj_to_labels (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
+        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
             Used in the sequence classification and multiple choice models.
 
-            Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes.
-        summary_first_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
+        summary_first_dropout (`float`, *optional*, defaults to 0.1):
             Used in the sequence classification and multiple choice models.
 
             The dropout ratio to be used after the projection and activation.
-        start_n_top (:obj:`int`, `optional`, defaults to 5):
+        start_n_top (`int`, *optional*, defaults to 5):
             Used in the SQuAD evaluation script.
-        end_n_top (:obj:`int`, `optional`, defaults to 5):
+        end_n_top (`int`, *optional*, defaults to 5):
             Used in the SQuAD evaluation script.
-        mask_token_id (:obj:`int`, `optional`, defaults to 0):
+        mask_token_id (`int`, *optional*, defaults to 0):
             Model agnostic parameter to identify masked tokens when generating text in an MLM context.
-        lang_id (:obj:`int`, `optional`, defaults to 1):
+        lang_id (`int`, *optional*, defaults to 1):
             The ID of the language used by the model. This parameter is used when generating text in a given language.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import XLMConfig, XLMModel
+    ```python
+    >>> from transformers import XLMConfig, XLMModel
 
-        >>> # Initializing a XLM configuration
-        >>> configuration = XLMConfig()
+    >>> # Initializing a XLM configuration
+    >>> configuration = XLMConfig()
 
-        >>> # Initializing a model from the configuration
-        >>> model = XLMModel(configuration)
+    >>> # Initializing a model from the configuration
+    >>> model = XLMModel(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "xlm"
     attribute_map = {
diff --git a/src/transformers/models/xlm/tokenization_xlm.py b/src/transformers/models/xlm/tokenization_xlm.py
index dbf097992e..08e8604a7b 100644
--- a/src/transformers/models/xlm/tokenization_xlm.py
+++ b/src/transformers/models/xlm/tokenization_xlm.py
@@ -534,49 +534,52 @@ class XLMTokenizer(PreTrainedTokenizer):
     - Moses preprocessing and tokenization for most supported languages.
     - Language specific tokenization for Chinese (Jieba), Japanese (KyTea) and Thai (PyThaiNLP).
     - Optionally lowercases and normalizes all inputs text.
-    - The arguments ``special_tokens`` and the function ``set_special_tokens``, can be used to add additional symbols
+    - The argument `special_tokens` and the function `set_special_tokens` can be used to add additional symbols
       (like "__classify__") to a vocabulary.
-    - The :obj:`lang2id` attribute maps the languages supported by the model with their IDs if provided (automatically
+    - The `lang2id` attribute maps the languages supported by the model to their IDs if provided (automatically
       set for pretrained vocabularies).
-    - The :obj:`id2lang` attributes does reverse mapping if provided (automatically set for pretrained vocabularies).
+    - The `id2lang` attribute does the reverse mapping if provided (automatically set for pretrained vocabularies).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Vocabulary file.
-        merges_file (:obj:`str`):
+        merges_file (`str`):
             Merges file.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+        cls_token (`str`, *optional*, defaults to `"</s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<special1>"`):
+        mask_token (`str`, *optional*, defaults to `"<special1>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<special0>","<special1>","<special2>","<special3>","<special4>","<special5>","<special6>","<special7>","<special8>","<special9>"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<special0>","<special1>","<special2>","<special3>","<special4>","<special5>","<special6>","<special7>","<special8>","<special9>"]`):
             List of additional special tokens.
-        lang2id (:obj:`Dict[str, int]`, `optional`):
+        lang2id (`Dict[str, int]`, *optional*):
             Dictionary mapping languages string identifiers to their IDs.
-        id2lang (:obj:`Dict[int, str]`, `optional`):
+        id2lang (`Dict[int, str]`, *optional*):
             Dictionary mapping language IDs to their string identifiers.
-        do_lowercase_and_remove_accent (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lowercase_and_remove_accent (`bool`, *optional*, defaults to `True`):
             Whether to lowercase and remove accents when tokenizing.
     """
 
@@ -866,17 +869,17 @@ class XLMTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An XLM sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
 
         """
         bos = [self.bos_token_id]
@@ -891,18 +894,18 @@ class XLMTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -921,21 +924,21 @@ class XLMTokenizer(PreTrainedTokenizer):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLM sequence
         pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py
index 32ea91a9ea..27d6117708 100644
--- a/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py
+++ b/src/transformers/models/xlm_prophetnet/configuration_xlm_prophetnet.py
@@ -28,7 +28,7 @@ XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class XLMProphetNetConfig(ProphetNetConfig):
     """
-    This class overrides :class:`~transformers.ProphetNetConfig`. Please check the superclass for the appropriate
+    This class overrides [`ProphetNetConfig`]. Please check the superclass for the appropriate
     documentation alongside usage examples.
     """
 
diff --git a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
index bb23b1d7aa..dda9f80560 100644
--- a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
+++ b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py
@@ -56,64 +56,69 @@ def load_vocab(vocab_file):
 
 class XLMProphetNetTokenizer(PreTrainedTokenizer):
     """
-    Adapted from :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on
-    `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+    [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
     Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -208,18 +213,18 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -239,13 +244,13 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
         does not make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
 
         """
 
@@ -307,17 +312,17 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. A XLMProphetNet sequence has the following format:
 
-        - single sequence: ``X [SEP]``
-        - pair of sequences: ``A [SEP] B [SEP]``
+        - single sequence: `X [SEP]`
+        - pair of sequences: `A [SEP] B [SEP]`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         if token_ids_1 is None:
diff --git a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py
index 9300bfcc79..e0974a52e0 100644
--- a/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/configuration_xlm_roberta.py
@@ -36,7 +36,7 @@ XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class XLMRobertaConfig(RobertaConfig):
     """
-    This class overrides :class:`~transformers.RobertaConfig`. Please check the superclass for the appropriate
+    This class overrides [`RobertaConfig`]. Please check the superclass for the appropriate
     documentation alongside usage examples.
     """
 
diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
index 78a56615eb..80bd7b419f 100644
--- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
+++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py
@@ -54,64 +54,69 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class XLMRobertaTokenizer(PreTrainedTokenizer):
     """
-    Adapted from :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on
-    `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+    [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
     Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -191,17 +196,17 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An XLM-RoBERTa sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         if token_ids_1 is None:
@@ -215,18 +220,18 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -246,13 +251,13 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         not make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
 
         """
 
diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
index 3c686110fd..4c30c9cbb6 100644
--- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
+++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
@@ -66,46 +66,51 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" XLM-RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
-    :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE
-    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.
+    Construct a "fast" XLM-RoBERTa tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+    [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
     """
 
@@ -154,17 +159,17 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An XLM-RoBERTa sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
 
         if token_ids_1 is None:
@@ -181,13 +186,13 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
         not make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of zeros.
+            `List[int]`: List of zeros.
 
         """
 
diff --git a/src/transformers/models/xlnet/configuration_xlnet.py b/src/transformers/models/xlnet/configuration_xlnet.py
index 131e867ff7..1029428669 100644
--- a/src/transformers/models/xlnet/configuration_xlnet.py
+++ b/src/transformers/models/xlnet/configuration_xlnet.py
@@ -31,109 +31,110 @@ XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 
 class XLNetConfig(PretrainedConfig):
     """
-    This is the configuration class to store the configuration of a :class:`~transformers.XLNetModel` or a
-    :class:`~transformers.TFXLNetModel`. It is used to instantiate a XLNet model according to the specified arguments,
+    This is the configuration class to store the configuration of a [`XLNetModel`] or a
+    [`TFXLNetModel`]. It is used to instantiate an XLNet model according to the specified arguments,
     defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
-    to that of the `xlnet-large-cased <https://huggingface.co/xlnet-large-cased>`__ architecture.
+    to that of the [xlnet-large-cased](https://huggingface.co/xlnet-large-cased) architecture.
 
-    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
-    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model
+    outputs. Read the documentation from [`PretrainedConfig`] for more information.
 
     Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 32000):
+        vocab_size (`int`, *optional*, defaults to 32000):
             Vocabulary size of the XLNet model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.XLNetModel` or
-            :class:`~transformers.TFXLNetModel`.
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            `input_ids` passed when calling [`XLNetModel`] or
+            [`TFXLNetModel`].
+        d_model (`int`, *optional*, defaults to 1024):
             Dimensionality of the encoder layers and the pooler layer.
-        n_layer (:obj:`int`, `optional`, defaults to 24):
+        n_layer (`int`, *optional*, defaults to 24):
             Number of hidden layers in the Transformer encoder.
-        n_head (:obj:`int`, `optional`, defaults to 16):
+        n_head (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        d_inner (:obj:`int`, `optional`, defaults to 4096):
+        d_inner (`int`, *optional*, defaults to 4096):
             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
-        ff_activation (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
-            The non-linear activation function (function or string) in the If string, :obj:`"gelu"`, :obj:`"relu"`,
-            :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        untie_r (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        ff_activation (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the feed-forward layer. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        untie_r (`bool`, *optional*, defaults to `True`):
             Whether or not to untie relative position biases
-        attn_type (:obj:`str`, `optional`, defaults to :obj:`"bi"`):
-            The attention type used by the model. Set :obj:`"bi"` for XLNet, :obj:`"uni"` for Transformer-XL.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        attn_type (`str`, *optional*, defaults to `"bi"`):
+            The attention type used by the model. Set `"bi"` for XLNet, `"uni"` for Transformer-XL.
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        mem_len (:obj:`int` or :obj:`None`, `optional`):
+        mem_len (`int` or `None`, *optional*):
             The number of tokens to cache. The key/value pairs that have already been pre-computed in a previous
-            forward pass won't be re-computed. See the `quickstart
-            <https://huggingface.co/transformers/quickstart.html#using-the-past>`__ for more information.
-        reuse_len (:obj:`int`, `optional`):
+            forward pass won't be re-computed. See the [quickstart](https://huggingface.co/transformers/quickstart.html#using-the-past) for more information.
+        reuse_len (`int`, *optional*):
             The number of tokens in the current batch to be cached and reused in the future.
-        bi_data (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to use bidirectional input pipeline. Usually set to :obj:`True` during pretraining and
-            :obj:`False` during finetuning.
-        clamp_len (:obj:`int`, `optional`, defaults to -1):
+        bi_data (`bool`, *optional*, defaults to `False`):
+            Whether or not to use bidirectional input pipeline. Usually set to `True` during pretraining and
+            `False` during finetuning.
+        clamp_len (`int`, *optional*, defaults to -1):
             Clamp all relative distances larger than clamp_len. Setting this attribute to -1 means no clamping.
-        same_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        same_length (`bool`, *optional*, defaults to `False`):
             Whether or not to use the same attention length for each token.
-        summary_type (:obj:`str`, `optional`, defaults to "last"):
+        summary_type (`str`, *optional*, defaults to `"last"`):
             Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
 
             Has to be one of the following options:
 
-                - :obj:`"last"`: Take the last token hidden state (like XLNet).
-                - :obj:`"first"`: Take the first token hidden state (like BERT).
-                - :obj:`"mean"`: Take the mean of all tokens hidden states.
-                - :obj:`"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
-                - :obj:`"attn"`: Not implemented now, use multi-head attention.
-        summary_use_proj (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                - `"last"`: Take the last token hidden state (like XLNet).
+                - `"first"`: Take the first token hidden state (like BERT).
+                - `"mean"`: Take the mean of all tokens hidden states.
+                - `"cls_index"`: Supply a Tensor of classification token position (like GPT/GPT-2).
+                - `"attn"`: Not implemented now, use multi-head attention.
+        summary_use_proj (`bool`, *optional*, defaults to `True`):
             Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
 
             Whether or not to add a projection after the vector extraction.
-        summary_activation (:obj:`str`, `optional`):
+        summary_activation (`str`, *optional*):
             Argument used when doing sequence summary. Used in the sequence classification and multiple choice models.
 
-            Pass :obj:`"tanh"` for a tanh activation to the output, any other value will result in no activation.
-        summary_proj_to_labels (:obj:`boo`, `optional`, defaults to :obj:`True`):
+            Pass `"tanh"` for a tanh activation to the output, any other value will result in no activation.
+        summary_proj_to_labels (`bool`, *optional*, defaults to `True`):
             Used in the sequence classification and multiple choice models.
 
-            Whether the projection outputs should have :obj:`config.num_labels` or :obj:`config.hidden_size` classes.
-        summary_last_dropout (:obj:`float`, `optional`, defaults to 0.1):
+            Whether the projection outputs should have `config.num_labels` or `config.hidden_size` classes.
+        summary_last_dropout (`float`, *optional*, defaults to 0.1):
             Used in the sequence classification and multiple choice models.
 
             The dropout ratio to be used after the projection and activation.
-        start_n_top (:obj:`int`, `optional`, defaults to 5):
+        start_n_top (`int`, *optional*, defaults to 5):
             Used in the SQuAD evaluation script.
-        end_n_top (:obj:`int`, `optional`, defaults to 5):
+        end_n_top (`int`, *optional*, defaults to 5):
             Used in the SQuAD evaluation script.
-        use_mems_eval (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_mems_eval (`bool`, *optional*, defaults to `True`):
             Whether or not the model should make use of the recurrent memory mechanism in evaluation mode.
-        use_mems_train (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_mems_train (`bool`, *optional*, defaults to `False`):
             Whether or not the model should make use of the recurrent memory mechanism in train mode.
 
-            .. note::
-                For pretraining, it is recommended to set ``use_mems_train`` to :obj:`True`. For fine-tuning, it is
-                recommended to set ``use_mems_train`` to :obj:`False` as discussed `here
-                <https://github.com/zihangdai/xlnet/issues/41#issuecomment-505102587>`__. If ``use_mems_train`` is set
-                to :obj:`True`, one has to make sure that the train batches are correctly pre-processed, `e.g.`
-                :obj:`batch_1 = [[This line is], [This is the]]` and :obj:`batch_2 = [[ the first line], [ second
-                line]]` and that all batches are of equal size.
+            <Tip>
 
-    Examples::
+            For pretraining, it is recommended to set `use_mems_train` to `True`. For fine-tuning, it is
+            recommended to set `use_mems_train` to `False` as discussed [here](https://github.com/zihangdai/xlnet/issues/41#issuecomment-505102587). If `use_mems_train` is set
+            to `True`, one has to make sure that the train batches are correctly pre-processed, *e.g.*
+            `batch_1 = [[This line is], [This is the]]` and `batch_2 = [[ the first line], [ second line]]` and that all batches are of equal size.
 
-        >>> from transformers import XLNetConfig, XLNetModel
+            </Tip>
 
-        >>> # Initializing a XLNet configuration
-        >>> configuration = XLNetConfig()
+    Examples:
 
-        >>> # Initializing a model from the configuration
-        >>> model = XLNetModel(configuration)
+    ```python
+    >>> from transformers import XLNetConfig, XLNetModel
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Initializing a XLNet configuration
+    >>> configuration = XLNetConfig()
+
+    >>> # Initializing a model from the configuration
+    >>> model = XLNetModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
 
     model_type = "xlnet"
     keys_to_ignore_at_inference = ["mems"]
diff --git a/src/transformers/models/xlnet/modeling_tf_xlnet.py b/src/transformers/models/xlnet/modeling_tf_xlnet.py
index 6823b59c1f..02a7629eba 100644
--- a/src/transformers/models/xlnet/modeling_tf_xlnet.py
+++ b/src/transformers/models/xlnet/modeling_tf_xlnet.py
@@ -485,7 +485,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
             qlen: TODO Lysandre didn't fill
             mlen: TODO Lysandre didn't fill
 
-        ::
+        ```
 
                   same_length=False:      same_length=True:
                   <mlen > <  qlen >       <mlen > <  qlen >
@@ -494,7 +494,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
             qlen [0 0 0 0 0 0 0 1 1]     [1 1 0 0 0 0 0 1 1]
                  [0 0 0 0 0 0 0 0 1]     [1 1 1 0 0 0 0 0 1]
                v [0 0 0 0 0 0 0 0 0]     [1 1 1 1 0 0 0 0 0]
-
+        ```
         """
         attn_mask = tf.ones([qlen, qlen])
         mask_u = tf.linalg.band_part(attn_mask, 0, -1)
@@ -1069,15 +1069,15 @@ XLNET_START_DOCSTRING = r"""
 
 XLNET_INPUTS_DOCSTRING = r"""
     Args:
-        input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
+        input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`):
             Indices of input sequence tokens in the vocabulary.
 
-            Indices can be obtained using [`BertTokenizer`]. See
-            [`PreTrainedTokenizer.__call__`] and [`PreTrainedTokenizer.encode`] for
+            Indices can be obtained using [`XLNetTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
             details.
 
             [What are input IDs?](../glossary#input-ids)
-        attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+        attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
 
             - 1 for tokens that are **not masked**,
@@ -1089,8 +1089,8 @@ XLNET_INPUTS_DOCSTRING = r"""
             decoding. The token ids which have their past given to this model should not be passed as `input_ids`
             as they have already been computed.
 
-            :obj:`use_mems` has to be set to `True` to make use of `mems`.
-        perm_mask (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length, sequence_length)`, *optional*):
+            `use_mems` has to be set to `True` to make use of `mems`.
+        perm_mask (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length, sequence_length)`, *optional*):
             Mask to indicate the attention pattern for each input token with values selected in `[0, 1]`:
 
             - if `perm_mask[k, i, j] = 0`, i attend to j in batch k;
@@ -1098,17 +1098,18 @@ XLNET_INPUTS_DOCSTRING = r"""
 
             If not set, each token attends to all the others (full bidirectional attention). Only used during
             pretraining (to define factorization order) or for sequential decoding (generation).
-        target_mapping (`tf.Tensor` or `Numpy array` of shape `(batch_size, num_predict, sequence_length)`, *optional*):
+        target_mapping (`tf.Tensor` or `Numpy array` of shape `(batch_size, num_predict, sequence_length)`, *optional*):
             Mask to indicate the output tokens to use. If `target_mapping[k, i, j] = 1`, the i-th predict in batch k
-            is on the j-th token.
-        token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
+            is on the j-th token. Only used during pretraining for partial prediction or for sequential decoding
+            (generation).
+        token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*):
             Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
 
             - 0 corresponds to a *sentence A* token,
             - 1 corresponds to a *sentence B* token.
 
             [What are token type IDs?](../glossary#token-type-ids)
-        input_mask (`tf.Tensor` or `Numpy array` of shape `({0})`, *optional*):
+        input_mask (`tf.Tensor` or `Numpy array` of shape `({0})`, *optional*):
             Mask to avoid performing attention on padding token indices. Negative of `attention_mask`, i.e. with 0
             for real tokens and 1 for padding which is kept for compatibility with the original code base.
 
@@ -1118,30 +1119,24 @@ XLNET_INPUTS_DOCSTRING = r"""
             - 0 for tokens that are **not masked**.
 
             You can only uses one of `input_mask` and `attention_mask`.
-        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+        head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
             Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
 
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
 
-        inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
+        inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
             This is useful if you want more control over how to convert `input_ids` indices into associated
             vectors than the model's internal embedding lookup matrix.
         output_attentions (`bool`, *optional*):
             Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
-            config will be used instead.
+            tensors for more detail.
         output_hidden_states (`bool`, *optional*):
             Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
-            used instead.
+            more detail.
         return_dict (`bool`, *optional*):
-            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. This
-            argument can be used in eager mode, in graph mode the value will always be set to True.
-        training (`bool`, *optional*, defaults to `False`):
-            Whether or not to use the model in training mode (some modules like dropout modules have different
-            behaviors between training and evaluation).
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
 """
 
 
diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py
index afd87e309c..d84c568ce3 100644
--- a/src/transformers/models/xlnet/tokenization_xlnet.py
+++ b/src/transformers/models/xlnet/tokenization_xlnet.py
@@ -53,70 +53,75 @@ SEG_ID_PAD = 4
 
 class XLNetTokenizer(PreTrainedTokenizer):
     """
-    Construct an XLNet tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+    Construct an XLNet tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
     Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether to lowercase the input when tokenizing.
-        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        remove_space (`bool`, *optional*, defaults to `True`):
             Whether to strip the text when tokenizing (removing excess spaces before and after the string).
-        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        keep_accents (`bool`, *optional*, defaults to `False`):
             Whether to keep accents when tokenizing.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"<sep>"`):
+        sep_token (`str`, *optional*, defaults to `"<sep>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<cls>"`):
+        cls_token (`str`, *optional*, defaults to `"<cls>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
             Additional special tokens used by the tokenizer.
-        sp_model_kwargs (:obj:`dict`, `optional`):
-            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
-            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, to set:
 
-            - ``enable_sampling``: Enable subword regularization.
-            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
 
-              - ``nbest_size = {0,1}``: No sampling is performed.
-              - ``nbest_size > 1``: samples from the nbest_size results.
-              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                 using forward-filtering-and-backward-sampling algorithm.
 
-            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
               BPE-dropout.
 
     Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -251,17 +256,17 @@ class XLNetTokenizer(PreTrainedTokenizer):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An XLNet sequence has the following format:
 
-        - single sequence: ``X <sep> <cls>``
-        - pair of sequences: ``A <sep> B <sep> <cls>``
+        - single sequence: `X <sep> <cls>`
+        - pair of sequences: `A <sep> B <sep> <cls>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -274,18 +279,18 @@ class XLNetTokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
 
         if already_has_special_tokens:
@@ -304,21 +309,21 @@ class XLNetTokenizer(PreTrainedTokenizer):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
         sequence pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/models/xlnet/tokenization_xlnet_fast.py b/src/transformers/models/xlnet/tokenization_xlnet_fast.py
index 8b72c8def8..1c7d938994 100644
--- a/src/transformers/models/xlnet/tokenization_xlnet_fast.py
+++ b/src/transformers/models/xlnet/tokenization_xlnet_fast.py
@@ -63,57 +63,62 @@ SEG_ID_PAD = 4
 
 class XLNetTokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
-    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
+    Construct a "fast" XLNet tokenizer (backed by HuggingFace's *tokenizers* library). Based on [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
 
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main
     methods. Users should refer to this superclass for more information regarding those methods.
 
     Args:
-        vocab_file (:obj:`str`):
-            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension) that
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
             contains the vocabulary necessary to instantiate a tokenizer.
-        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        do_lower_case (`bool`, *optional*, defaults to `True`):
             Whether to lowercase the input when tokenizing.
-        remove_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        remove_space (`bool`, *optional*, defaults to `True`):
             Whether to strip the text when tokenizing (removing excess spaces before and after the string).
-        keep_accents (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        keep_accents (`bool`, *optional*, defaults to `False`):
             Whether to keep accents when tokenizing.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
             The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the beginning of
-                sequence. The token used is the :obj:`cls_token`.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
             The end of sequence token.
 
-            .. note::
+            <Tip>
 
-                When building a sequence using special tokens, this is not the token that is used for the end of
-                sequence. The token used is the :obj:`sep_token`.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            When building a sequence using special tokens, this is not the token that is used for the end of
+            sequence. The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
             token instead.
-        sep_token (:obj:`str`, `optional`, defaults to :obj:`"<sep>"`):
+        sep_token (`str`, *optional*, defaults to `"<sep>"`):
             The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
             sequence classification or for a text and a question for question answering. It is also used as the last
             token of a sequence built with special tokens.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
-        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<cls>"`):
+        cls_token (`str`, *optional*, defaults to `"<cls>"`):
             The classifier token which is used when doing sequence classification (classification of the whole sequence
             instead of per-token classification). It is the first token of the sequence when built with special tokens.
-        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
-        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
+        additional_special_tokens (`List[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
             Additional special tokens used by the tokenizer.
 
     Attributes:
-        sp_model (:obj:`SentencePieceProcessor`):
-            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -173,17 +178,17 @@ class XLNetTokenizerFast(PreTrainedTokenizerFast):
         Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
         adding special tokens. An XLNet sequence has the following format:
 
-        - single sequence: ``X <sep> <cls>``
-        - pair of sequences: ``A <sep> B <sep> <cls>``
+        - single sequence: `X <sep> <cls>`
+        - pair of sequences: `A <sep> B <sep> <cls>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -198,21 +203,21 @@ class XLNetTokenizerFast(PreTrainedTokenizerFast):
         Create a mask from the two sequences passed to be used in a sequence-pair classification task. An XLNet
         sequence pair mask has the following format:
 
-        ::
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
 
-            0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
-            | first sequence    | second sequence |
-
-        If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
+        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
             sequence(s).
         """
         sep = [self.sep_token_id]
diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index e60b8b332d..5681a40710 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -35,13 +35,13 @@ def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1):
     Create a schedule with a constant learning rate, using the learning rate set in optimizer.
 
     Args:
-        optimizer (:class:`~torch.optim.Optimizer`):
+        optimizer ([`~torch.optim.Optimizer`]):
             The optimizer for which to schedule the learning rate.
-        last_epoch (:obj:`int`, `optional`, defaults to -1):
+        last_epoch (`int`, *optional*, defaults to -1):
             The index of the last epoch when resuming training.
 
     Return:
-        :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
     """
     return LambdaLR(optimizer, lambda _: 1, last_epoch=last_epoch)
 
@@ -52,15 +52,15 @@ def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: in
     increases linearly between 0 and the initial lr set in the optimizer.
 
     Args:
-        optimizer (:class:`~torch.optim.Optimizer`):
+        optimizer ([`~torch.optim.Optimizer`]):
             The optimizer for which to schedule the learning rate.
-        num_warmup_steps (:obj:`int`):
+        num_warmup_steps (`int`):
             The number of steps for the warmup phase.
-        last_epoch (:obj:`int`, `optional`, defaults to -1):
+        last_epoch (`int`, *optional*, defaults to -1):
             The index of the last epoch when resuming training.
 
     Return:
-        :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
     """
 
     def lr_lambda(current_step: int):
@@ -77,17 +77,17 @@ def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_st
     a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer.
 
     Args:
-        optimizer (:class:`~torch.optim.Optimizer`):
+        optimizer ([`~torch.optim.Optimizer`]):
             The optimizer for which to schedule the learning rate.
-        num_warmup_steps (:obj:`int`):
+        num_warmup_steps (`int`):
             The number of steps for the warmup phase.
-        num_training_steps (:obj:`int`):
+        num_training_steps (`int`):
             The total number of training steps.
-        last_epoch (:obj:`int`, `optional`, defaults to -1):
+        last_epoch (`int`, *optional*, defaults to -1):
             The index of the last epoch when resuming training.
 
     Return:
-        :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
     """
 
     def lr_lambda(current_step: int):
@@ -109,20 +109,20 @@ def get_cosine_schedule_with_warmup(
     initial lr set in the optimizer.
 
     Args:
-        optimizer (:class:`~torch.optim.Optimizer`):
+        optimizer ([`~torch.optim.Optimizer`]):
             The optimizer for which to schedule the learning rate.
-        num_warmup_steps (:obj:`int`):
+        num_warmup_steps (`int`):
             The number of steps for the warmup phase.
-        num_training_steps (:obj:`int`):
+        num_training_steps (`int`):
             The total number of training steps.
-        num_cycles (:obj:`float`, `optional`, defaults to 0.5):
+        num_cycles (`float`, *optional*, defaults to 0.5):
             The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
             following a half-cosine).
-        last_epoch (:obj:`int`, `optional`, defaults to -1):
+        last_epoch (`int`, *optional*, defaults to -1):
             The index of the last epoch when resuming training.
 
     Return:
-        :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
     """
 
     def lr_lambda(current_step):
@@ -143,19 +143,19 @@ def get_cosine_with_hard_restarts_schedule_with_warmup(
     linearly between 0 and the initial lr set in the optimizer.
 
     Args:
-        optimizer (:class:`~torch.optim.Optimizer`):
+        optimizer ([`~torch.optim.Optimizer`]):
             The optimizer for which to schedule the learning rate.
-        num_warmup_steps (:obj:`int`):
+        num_warmup_steps (`int`):
             The number of steps for the warmup phase.
-        num_training_steps (:obj:`int`):
+        num_training_steps (`int`):
             The total number of training steps.
-        num_cycles (:obj:`int`, `optional`, defaults to 1):
+        num_cycles (`int`, *optional*, defaults to 1):
             The number of hard restarts to use.
-        last_epoch (:obj:`int`, `optional`, defaults to -1):
+        last_epoch (`int`, *optional*, defaults to -1):
             The index of the last epoch when resuming training.
 
     Return:
-        :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
     """
 
     def lr_lambda(current_step):
@@ -174,29 +174,29 @@ def get_polynomial_decay_schedule_with_warmup(
 ):
     """
     Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the
-    optimizer to end lr defined by `lr_end`, after a warmup period during which it increases linearly from 0 to the
+    optimizer to end lr defined by `lr_end`, after a warmup period during which it increases linearly from 0 to the
     initial lr set in the optimizer.
 
     Args:
-        optimizer (:class:`~torch.optim.Optimizer`):
+        optimizer ([`~torch.optim.Optimizer`]):
             The optimizer for which to schedule the learning rate.
-        num_warmup_steps (:obj:`int`):
+        num_warmup_steps (`int`):
             The number of steps for the warmup phase.
-        num_training_steps (:obj:`int`):
+        num_training_steps (`int`):
             The total number of training steps.
-        lr_end (:obj:`float`, `optional`, defaults to 1e-7):
+        lr_end (`float`, *optional*, defaults to 1e-7):
             The end LR.
-        power (:obj:`float`, `optional`, defaults to 1.0):
+        power (`float`, *optional*, defaults to 1.0):
             Power factor.
-        last_epoch (:obj:`int`, `optional`, defaults to -1):
+        last_epoch (`int`, *optional*, defaults to -1):
             The index of the last epoch when resuming training.
 
-    Note: `power` defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT
+    Note: `power` defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT
     implementation at
     https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37
 
     Return:
-        :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
+        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
 
     """
 
@@ -239,14 +239,14 @@ def get_scheduler(
     Unified API to get any scheduler from its name.
 
     Args:
-        name (:obj:`str` or `:obj:`SchedulerType`):
+        name (`str` or `SchedulerType`):
             The name of the scheduler to use.
-        optimizer (:obj:`torch.optim.Optimizer`):
+        optimizer (`torch.optim.Optimizer`):
             The optimizer that will be used during training.
-        num_warmup_steps (:obj:`int`, `optional`):
+        num_warmup_steps (`int`, *optional*):
             The number of warmup steps to do. This is not required by all schedulers (hence the argument being
             optional), the function will raise an error if it's unset and the scheduler type requires it.
-        num_training_steps (:obj:`int`, `optional`):
+        num_training_steps (`int`, *optional*):
             The number of training steps to do. This is not required by all schedulers (hence the argument being
             optional), the function will raise an error if it's unset and the scheduler type requires it.
     """
@@ -271,22 +271,21 @@ def get_scheduler(
 
 class AdamW(Optimizer):
     """
-    Implements Adam algorithm with weight decay fix as introduced in `Decoupled Weight Decay Regularization
-    <https://arxiv.org/abs/1711.05101>`__.
+    Implements Adam algorithm with weight decay fix as introduced in [Decoupled Weight Decay Regularization](https://arxiv.org/abs/1711.05101).
 
     Parameters:
-        params (:obj:`Iterable[nn.parameter.Parameter]`):
+        params (`Iterable[nn.parameter.Parameter]`):
             Iterable of parameters to optimize or dictionaries defining parameter groups.
-        lr (:obj:`float`, `optional`, defaults to 1e-3):
+        lr (`float`, *optional*, defaults to 1e-3):
             The learning rate to use.
-        betas (:obj:`Tuple[float,float]`, `optional`, defaults to (0.9, 0.999)):
+        betas (`Tuple[float,float]`, *optional*, defaults to (0.9, 0.999)):
             Adam's betas parameters (b1, b2).
-        eps (:obj:`float`, `optional`, defaults to 1e-6):
+        eps (`float`, *optional*, defaults to 1e-6):
             Adam's epsilon for numerical stability.
-        weight_decay (:obj:`float`, `optional`, defaults to 0):
+        weight_decay (`float`, *optional*, defaults to 0):
             Decoupled weight decay to apply.
-        correct_bias (:obj:`bool`, `optional`, defaults to `True`):
-            Whether or not to correct bias in Adam (for instance, in Bert TF repository they use :obj:`False`).
+        correct_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not to correct bias in Adam (for instance, in Bert TF repository they use `False`).
     """
 
     def __init__(
@@ -315,7 +314,7 @@ class AdamW(Optimizer):
         Performs a single optimization step.
 
         Arguments:
-            closure (:obj:`Callable`, `optional`): A closure that reevaluates the model and returns the loss.
+            closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
         """
         loss = None
         if closure is not None:
@@ -377,31 +376,31 @@ class Adafactor(Optimizer):
     AdaFactor pytorch implementation can be used as a drop in replacement for Adam original fairseq code:
     https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py
 
-    Paper: `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost` https://arxiv.org/abs/1804.04235 Note that
-    this optimizer internally adjusts the learning rate depending on the *scale_parameter*, *relative_step* and
-    *warmup_init* options. To use a manual (external) learning rate schedule you should set `scale_parameter=False` and
+    Paper: *Adafactor: Adaptive Learning Rates with Sublinear Memory Cost* https://arxiv.org/abs/1804.04235 Note that
+    this optimizer internally adjusts the learning rate depending on the `scale_parameter`, `relative_step` and
+    `warmup_init` options. To use a manual (external) learning rate schedule you should set `scale_parameter=False` and
     `relative_step=False`.
 
     Arguments:
-        params (:obj:`Iterable[nn.parameter.Parameter]`):
+        params (`Iterable[nn.parameter.Parameter]`):
             Iterable of parameters to optimize or dictionaries defining parameter groups.
-        lr (:obj:`float`, `optional`):
+        lr (`float`, *optional*):
             The external learning rate.
-        eps (:obj:`Tuple[float, float]`, `optional`, defaults to (1e-30, 1e-3)):
+        eps (`Tuple[float, float]`, *optional*, defaults to (1e-30, 1e-3)):
             Regularization constants for square gradient and parameter scale respectively
-        clip_threshold (:obj:`float`, `optional`, defaults 1.0):
+        clip_threshold (`float`, *optional*, defaults to 1.0):
             Threshold of root mean square of final gradient update
-        decay_rate (:obj:`float`, `optional`, defaults to -0.8):
+        decay_rate (`float`, *optional*, defaults to -0.8):
             Coefficient used to compute running averages of square
-        beta1 (:obj:`float`, `optional`):
+        beta1 (`float`, *optional*):
             Coefficient used for computing running averages of gradient
-        weight_decay (:obj:`float`, `optional`, defaults to 0):
+        weight_decay (`float`, *optional*, defaults to 0):
             Weight decay (L2 penalty)
-        scale_parameter (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        scale_parameter (`bool`, *optional*, defaults to `True`):
             If True, learning rate is scaled by root mean square
-        relative_step (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        relative_step (`bool`, *optional*, defaults to `True`):
             If True, time-dependent learning rate is computed instead of external learning rate
-        warmup_init (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        warmup_init (`bool`, *optional*, defaults to `False`):
             Time-dependent learning rate computation depends on whether warm-up initialization is being used
 
     This implementation handles low-precision (FP16, bfloat) values, but we have not thoroughly tested.
@@ -410,43 +409,50 @@ class Adafactor(Optimizer):
 
         - Training without LR warmup or clip_threshold is not recommended.
 
-           * use scheduled LR warm-up to fixed LR
-           * use clip_threshold=1.0 (https://arxiv.org/abs/1804.04235)
+           - use scheduled LR warm-up to fixed LR
+           - use clip_threshold=1.0 (https://arxiv.org/abs/1804.04235)
         - Disable relative updates
         - Use scale_parameter=False
         - Additional optimizer operations like gradient clipping should not be used alongside Adafactor
 
-        Example::
+    Example:
 
-            Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=1e-3)
+    ```python
+    Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=1e-3)
+    ```
 
-        Others reported the following combination to work well::
+    Others reported the following combination to work well:
 
-            Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
+    ```python
+    Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
+    ```
 
-        When using ``lr=None`` with :class:`~transformers.Trainer` you will most likely need to use :class:`~transformers.optimization.AdafactorSchedule` scheduler as following::
+    When using `lr=None` with [`Trainer`] you will most likely need to use [`~optimization.AdafactorSchedule`] scheduler as following:
 
-            from transformers.optimization import Adafactor, AdafactorSchedule
-            optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
-            lr_scheduler = AdafactorSchedule(optimizer)
-            trainer = Trainer(..., optimizers=(optimizer, lr_scheduler))
+    ```python
+    from transformers.optimization import Adafactor, AdafactorSchedule
+    optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None)
+    lr_scheduler = AdafactorSchedule(optimizer)
+    trainer = Trainer(..., optimizers=(optimizer, lr_scheduler))
+    ```
 
-    Usage::
+    Usage:
 
-        # replace AdamW with Adafactor
-        optimizer = Adafactor(
-            model.parameters(),
-            lr=1e-3,
-            eps=(1e-30, 1e-3),
-            clip_threshold=1.0,
-            decay_rate=-0.8,
-            beta1=None,
-            weight_decay=0.0,
-            relative_step=False,
-            scale_parameter=False,
-            warmup_init=False
-        )
-    """
+    ```python
+    # replace AdamW with Adafactor
+    optimizer = Adafactor(
+        model.parameters(),
+        lr=1e-3,
+        eps=(1e-30, 1e-3),
+        clip_threshold=1.0,
+        decay_rate=-0.8,
+        beta1=None,
+        weight_decay=0.0,
+        relative_step=False,
+        scale_parameter=False,
+        warmup_init=False
+    )
+    ```"""
 
     def __init__(
         self,
@@ -605,11 +611,11 @@ class Adafactor(Optimizer):
 
 class AdafactorSchedule(LambdaLR):
     """
-    Since :class:`~transformers.optimization.Adafactor` performs its own scheduling, if the training loop relies on a
+    Since [`~optimization.Adafactor`] performs its own scheduling, if the training loop relies on a
     scheduler (e.g., for logging), this class creates a proxy object that retrieves the current lr values from the
     optimizer.
 
-    It returns ``initial_lr`` during startup and the actual ``lr`` during stepping.
+    It returns `initial_lr` during startup and the actual `lr` during stepping.
     """
 
     def __init__(self, optimizer, initial_lr=0.0):
@@ -636,16 +642,16 @@ class AdafactorSchedule(LambdaLR):
 
 def get_adafactor_schedule(optimizer, initial_lr=0.0):
     """
-    Get a proxy schedule for :class:`~transformers.optimization.Adafactor`
+    Get a proxy schedule for [`~optimization.Adafactor`]
 
     Args:
-        optimizer (:class:`~torch.optim.Optimizer`):
+        optimizer ([`~torch.optim.Optimizer`]):
             The optimizer for which to schedule the learning rate.
-        initial_lr (:obj:`float`, `optional`, defaults to 0.0):
+        initial_lr (`float`, *optional*, defaults to 0.0):
             Initial lr
 
     Return:
-        :class:`~transformers.optimization.Adafactor` proxy schedule object.
+        [`~optimization.Adafactor`] proxy schedule object.
 
 
     """
diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py
index 8e369223df..d18f85fe78 100644
--- a/src/transformers/optimization_tf.py
+++ b/src/transformers/optimization_tf.py
@@ -26,16 +26,16 @@ class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
     Applies a warmup schedule on a given learning rate decay schedule.
 
     Args:
-        initial_learning_rate (:obj:`float`):
+        initial_learning_rate (`float`):
             The initial learning rate for the schedule after the warmup (so this will be the learning rate at the end
             of the warmup).
-        decay_schedule_fn (:obj:`Callable`):
+        decay_schedule_fn (`Callable`):
             The schedule function to apply after the warmup for the rest of training.
-        warmup_steps (:obj:`int`):
+        warmup_steps (`int`):
             The number of steps for the warmup part of training.
-        power (:obj:`float`, `optional`, defaults to 1):
+        power (`float`, *optional*, defaults to 1):
             The power to use for the polynomial warmup (defaults is a linear warmup).
-        name (:obj:`str`, `optional`):
+        name (`str`, *optional*):
             Optional name prefix for the returned tensors during the schedule.
     """
 
@@ -95,25 +95,25 @@ def create_optimizer(
     Creates an optimizer with a learning rate schedule using a warmup phase followed by a linear decay.
 
     Args:
-        init_lr (:obj:`float`):
+        init_lr (`float`):
             The desired learning rate at the end of the warmup phase.
-        num_train_steps (:obj:`int`):
+        num_train_steps (`int`):
             The total number of training steps.
-        num_warmup_steps (:obj:`int`):
+        num_warmup_steps (`int`):
             The number of warmup steps.
-        min_lr_ratio (:obj:`float`, `optional`, defaults to 0):
-            The final learning rate at the end of the linear decay will be :obj:`init_lr * min_lr_ratio`.
-        adam_beta1 (:obj:`float`, `optional`, defaults to 0.9):
+        min_lr_ratio (`float`, *optional*, defaults to 0):
+            The final learning rate at the end of the linear decay will be `init_lr * min_lr_ratio`.
+        adam_beta1 (`float`, *optional*, defaults to 0.9):
             The beta1 to use in Adam.
-        adam_beta2 (:obj:`float`, `optional`, defaults to 0.999):
+        adam_beta2 (`float`, *optional*, defaults to 0.999):
             The beta2 to use in Adam.
-        adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
+        adam_epsilon (`float`, *optional*, defaults to 1e-8):
             The epsilon to use in Adam.
-        weight_decay_rate (:obj:`float`, `optional`, defaults to 0):
+        weight_decay_rate (`float`, *optional*, defaults to 0):
             The weight decay to use.
-        power (:obj:`float`, `optional`, defaults to 1.0):
+        power (`float`, *optional*, defaults to 1.0):
             The power to use for PolynomialDecay.
-        include_in_weight_decay (:obj:`List[str]`, `optional`):
+        include_in_weight_decay (`List[str]`, *optional*):
             List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
             applied to all parameters except bias and layer norm parameters.
     """
@@ -153,39 +153,37 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
     """
     Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the
     loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact
-    with the m and v parameters in strange ways as shown in `Decoupled Weight Decay Regularization
-    <https://arxiv.org/abs/1711.05101>`__.
+    with the m and v parameters in strange ways as shown in [Decoupled Weight Decay Regularization](https://arxiv.org/abs/1711.05101).
 
     Instead we want ot decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent
     to adding the square of the weights to the loss with plain (non-momentum) SGD.
 
     Args:
-        learning_rate (:obj:`Union[float, tf.keras.optimizers.schedules.LearningRateSchedule]`, `optional`, defaults to 1e-3):
+        learning_rate (`Union[float, tf.keras.optimizers.schedules.LearningRateSchedule]`, *optional*, defaults to 1e-3):
             The learning rate to use or a schedule.
-        beta_1 (:obj:`float`, `optional`, defaults to 0.9):
+        beta_1 (`float`, *optional*, defaults to 0.9):
             The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates.
-        beta_2 (:obj:`float`, `optional`, defaults to 0.999):
+        beta_2 (`float`, *optional*, defaults to 0.999):
             The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates.
-        epsilon (:obj:`float`, `optional`, defaults to 1e-7):
+        epsilon (`float`, *optional*, defaults to 1e-7):
             The epsilon parameter in Adam, which is a small constant for numerical stability.
-        amsgrad (:obj:`bool`, `optional`, default to `False`):
-            Whether to apply AMSGrad variant of this algorithm or not, see `On the Convergence of Adam and Beyond
-            <https://arxiv.org/abs/1904.09237>`__.
-        weight_decay_rate (:obj:`float`, `optional`, defaults to 0):
+        amsgrad (`bool`, *optional*, defaults to `False`):
+            Whether to apply AMSGrad variant of this algorithm or not, see [On the Convergence of Adam and Beyond](https://arxiv.org/abs/1904.09237).
+        weight_decay_rate (`float`, *optional*, defaults to 0):
             The weight decay to apply.
-        include_in_weight_decay (:obj:`List[str]`, `optional`):
+        include_in_weight_decay (`List[str]`, *optional*):
             List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is
-            applied to all parameters by default (unless they are in :obj:`exclude_from_weight_decay`).
-        exclude_from_weight_decay (:obj:`List[str]`, `optional`):
+            applied to all parameters by default (unless they are in `exclude_from_weight_decay`).
+        exclude_from_weight_decay (`List[str]`, *optional*):
             List of the parameter names (or re patterns) to exclude from applying weight decay to. If a
-            :obj:`include_in_weight_decay` is passed, the names in it will supersede this list.
-        name (:obj:`str`, `optional`, defaults to 'AdamWeightDecay'):
+            `include_in_weight_decay` is passed, the names in it will supersede this list.
+        name (`str`, *optional*, defaults to 'AdamWeightDecay'):
             Optional name for the operations created when applying gradients.
         kwargs:
-            Keyword arguments. Allowed to be {``clipnorm``, ``clipvalue``, ``lr``, ``decay``}. ``clipnorm`` is clip
-            gradients by norm; ``clipvalue`` is clip gradients by value, ``decay`` is included for backward
-            compatibility to allow time inverse decay of learning rate. ``lr`` is included for backward compatibility,
-            recommended to use ``learning_rate`` instead.
+            Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. `clipnorm` is clip
+            gradients by norm; `clipvalue` is clip gradients by value, `decay` is included for backward
+            compatibility to allow time inverse decay of learning rate. `lr` is included for backward compatibility,
+            recommended to use `learning_rate` instead.
     """
 
     def __init__(
@@ -283,7 +281,7 @@ class GradientAccumulator(object):
     """
     Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a
     replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should
-    then call ``.gradients``, scale the gradients if required, and pass the result to ``apply_gradients``.
+    then call `.gradients`, scale the gradients if required, and pass the result to `apply_gradients`.
     """
 
     # We use the ON_READ synchronization policy so that no synchronization is
@@ -316,7 +314,7 @@ class GradientAccumulator(object):
         return list(gradient.value() if gradient is not None else gradient for gradient in self._gradients)
 
     def __call__(self, gradients):
-        """Accumulates :obj:`gradients` on the current replica."""
+        """Accumulates `gradients` on the current replica."""
         if not self._gradients:
             _ = self.step  # Create the step variable.
             self._gradients.extend(
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index 4671981218..98c59ae25f 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -317,28 +317,28 @@ def check_task(task: str) -> Tuple[Dict, Any]:
     default models if they exist.
 
     Args:
-        task (:obj:`str`):
+        task (`str`):
             The task defining which pipeline will be returned. Currently accepted tasks are:
 
-            - :obj:`"audio-classification"`
-            - :obj:`"automatic-speech-recognition"`
-            - :obj:`"conversational"`
-            - :obj:`"feature-extraction"`
-            - :obj:`"fill-mask"`
-            - :obj:`"image-classification"`
-            - :obj:`"question-answering"`
-            - :obj:`"table-question-answering"`
-            - :obj:`"text2text-generation"`
-            - :obj:`"text-classification"` (alias :obj:`"sentiment-analysis" available)
-            - :obj:`"text-generation"`
-            - :obj:`"token-classification"` (alias :obj:`"ner"` available)
-            - :obj:`"translation"`
-            - :obj:`"translation_xx_to_yy"`
-            - :obj:`"summarization"`
-            - :obj:`"zero-shot-classification"`
+            - `"audio-classification"`
+            - `"automatic-speech-recognition"`
+            - `"conversational"`
+            - `"feature-extraction"`
+            - `"fill-mask"`
+            - `"image-classification"`
+            - `"question-answering"`
+            - `"table-question-answering"`
+            - `"text2text-generation"`
+            - `"text-classification"` (alias `"sentiment-analysis"` available)
+            - `"text-generation"`
+            - `"token-classification"` (alias `"ner"` available)
+            - `"translation"`
+            - `"translation_xx_to_yy"`
+            - `"summarization"`
+            - `"zero-shot-classification"`
 
     Returns:
-        (task_defaults:obj:`dict`, task_options: (:obj:`tuple`, None)) The actual dictionary required to initialize the
+        (task_defaults: `dict`, task_options: (`tuple`, None)) The actual dictionary required to initialize the
         pipeline and some extra task options for parametrized tasks like "translation_XX_to_YY"
 
 
@@ -374,114 +374,114 @@ def pipeline(
     **kwargs
 ) -> Pipeline:
     """
-    Utility factory method to build a :class:`~transformers.Pipeline`.
+    Utility factory method to build a [`Pipeline`].
 
     Pipelines are made of:
 
-        - A :doc:`tokenizer <tokenizer>` in charge of mapping raw textual input to token.
-        - A :doc:`model <model>` to make predictions from the inputs.
+        - A [tokenizer](tokenizer) in charge of mapping raw textual input to token.
+        - A [model](model) to make predictions from the inputs.
         - Some (optional) post processing for enhancing model's output.
 
     Args:
-        task (:obj:`str`):
+        task (`str`):
             The task defining which pipeline will be returned. Currently accepted tasks are:
 
-            - :obj:`"audio-classification"`: will return a :class:`~transformers.AudioClassificationPipeline`:.
-            - :obj:`"automatic-speech-recognition"`: will return a
-              :class:`~transformers.AutomaticSpeechRecognitionPipeline`:.
-            - :obj:`"conversational"`: will return a :class:`~transformers.ConversationalPipeline`:.
-            - :obj:`"feature-extraction"`: will return a :class:`~transformers.FeatureExtractionPipeline`:.
-            - :obj:`"fill-mask"`: will return a :class:`~transformers.FillMaskPipeline`:.
-            - :obj:`"image-classification"`: will return a :class:`~transformers.ImageClassificationPipeline`:.
-            - :obj:`"question-answering"`: will return a :class:`~transformers.QuestionAnsweringPipeline`:.
-            - :obj:`"table-question-answering"`: will return a :class:`~transformers.TableQuestionAnsweringPipeline`:.
-            - :obj:`"text2text-generation"`: will return a :class:`~transformers.Text2TextGenerationPipeline`:.
-            - :obj:`"text-classification"` (alias :obj:`"sentiment-analysis" available): will return a
-              :class:`~transformers.TextClassificationPipeline`:.
-            - :obj:`"text-generation"`: will return a :class:`~transformers.TextGenerationPipeline`:.
-            - :obj:`"token-classification"` (alias :obj:`"ner"` available): will return a
-              :class:`~transformers.TokenClassificationPipeline`:.
-            - :obj:`"translation"`: will return a :class:`~transformers.TranslationPipeline`:.
-            - :obj:`"translation_xx_to_yy"`: will return a :class:`~transformers.TranslationPipeline`:.
-            - :obj:`"summarization"`: will return a :class:`~transformers.SummarizationPipeline`:.
-            - :obj:`"zero-shot-classification"`: will return a :class:`~transformers.ZeroShotClassificationPipeline`:.
+            - `"audio-classification"`: will return a [`AudioClassificationPipeline`].
+            - `"automatic-speech-recognition"`: will return a
+              [`AutomaticSpeechRecognitionPipeline`].
+            - `"conversational"`: will return a [`ConversationalPipeline`].
+            - `"feature-extraction"`: will return a [`FeatureExtractionPipeline`].
+            - `"fill-mask"`: will return a [`FillMaskPipeline`].
+            - `"image-classification"`: will return a [`ImageClassificationPipeline`].
+            - `"question-answering"`: will return a [`QuestionAnsweringPipeline`].
+            - `"table-question-answering"`: will return a [`TableQuestionAnsweringPipeline`].
+            - `"text2text-generation"`: will return a [`Text2TextGenerationPipeline`].
+            - `"text-classification"` (alias `"sentiment-analysis"` available): will return a
+              [`TextClassificationPipeline`].
+            - `"text-generation"`: will return a [`TextGenerationPipeline`].
+            - `"token-classification"` (alias `"ner"` available): will return a
+              [`TokenClassificationPipeline`].
+            - `"translation"`: will return a [`TranslationPipeline`].
+            - `"translation_xx_to_yy"`: will return a [`TranslationPipeline`].
+            - `"summarization"`: will return a [`SummarizationPipeline`].
+            - `"zero-shot-classification"`: will return a [`ZeroShotClassificationPipeline`].
 
-        model (:obj:`str` or :class:`~transformers.PreTrainedModel` or :class:`~transformers.TFPreTrainedModel`, `optional`):
+        model (`str` or [`PreTrainedModel`] or [`TFPreTrainedModel`], *optional*):
             The model that will be used by the pipeline to make predictions. This can be a model identifier or an
-            actual instance of a pretrained model inheriting from :class:`~transformers.PreTrainedModel` (for PyTorch)
-            or :class:`~transformers.TFPreTrainedModel` (for TensorFlow).
+            actual instance of a pretrained model inheriting from [`PreTrainedModel`] (for PyTorch)
+            or [`TFPreTrainedModel`] (for TensorFlow).
 
-            If not provided, the default for the :obj:`task` will be loaded.
-        config (:obj:`str` or :class:`~transformers.PretrainedConfig`, `optional`):
+            If not provided, the default for the `task` will be loaded.
+        config (`str` or [`PretrainedConfig`], *optional*):
             The configuration that will be used by the pipeline to instantiate the model. This can be a model
             identifier or an actual pretrained model configuration inheriting from
-            :class:`~transformers.PretrainedConfig`.
+            [`PretrainedConfig`].
 
             If not provided, the default configuration file for the requested model will be used. That means that if
-            :obj:`model` is given, its default configuration will be used. However, if :obj:`model` is not supplied,
-            this :obj:`task`'s default model's config is used instead.
-        tokenizer (:obj:`str` or :class:`~transformers.PreTrainedTokenizer`, `optional`):
+            `model` is given, its default configuration will be used. However, if `model` is not supplied,
+            this `task`'s default model's config is used instead.
+        tokenizer (`str` or [`PreTrainedTokenizer`], *optional*):
             The tokenizer that will be used by the pipeline to encode data for the model. This can be a model
-            identifier or an actual pretrained tokenizer inheriting from :class:`~transformers.PreTrainedTokenizer`.
+            identifier or an actual pretrained tokenizer inheriting from [`PreTrainedTokenizer`].
 
-            If not provided, the default tokenizer for the given :obj:`model` will be loaded (if it is a string). If
-            :obj:`model` is not specified or not a string, then the default tokenizer for :obj:`config` is loaded (if
-            it is a string). However, if :obj:`config` is also not given or not a string, then the default tokenizer
-            for the given :obj:`task` will be loaded.
-        feature_extractor (:obj:`str` or :class:`~transformers.PreTrainedFeatureExtractor`, `optional`):
+            If not provided, the default tokenizer for the given `model` will be loaded (if it is a string). If
+            `model` is not specified or not a string, then the default tokenizer for `config` is loaded (if
+            it is a string). However, if `config` is also not given or not a string, then the default tokenizer
+            for the given `task` will be loaded.
+        feature_extractor (`str` or [`PreTrainedFeatureExtractor`], *optional*):
             The feature extractor that will be used by the pipeline to encode data for the model. This can be a model
             identifier or an actual pretrained feature extractor inheriting from
-            :class:`~transformers.PreTrainedFeatureExtractor`.
+            [`PreTrainedFeatureExtractor`].
 
             Feature extractors are used for non-NLP models, such as Speech or Vision models as well as multi-modal
             models. Multi-modal models will also require a tokenizer to be passed.
 
-            If not provided, the default feature extractor for the given :obj:`model` will be loaded (if it is a
-            string). If :obj:`model` is not specified or not a string, then the default feature extractor for
-            :obj:`config` is loaded (if it is a string). However, if :obj:`config` is also not given or not a string,
-            then the default feature extractor for the given :obj:`task` will be loaded.
-        framework (:obj:`str`, `optional`):
-            The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
+            If not provided, the default feature extractor for the given `model` will be loaded (if it is a
+            string). If `model` is not specified or not a string, then the default feature extractor for
+            `config` is loaded (if it is a string). However, if `config` is also not given or not a string,
+            then the default feature extractor for the given `task` will be loaded.
+        framework (`str`, *optional*):
+            The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework
             must be installed.
 
             If no framework is specified, will default to the one currently installed. If no framework is specified and
-            both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no model
+            both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model
             is provided.
-        revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+        revision (`str`, *optional*, defaults to `"main"`):
             When passing a task name or a string model identifier: The specific model version to use. It can be a
             branch name, a tag name, or a commit id, since we use a git-based system for storing models and other
-            artifacts on huggingface.co, so ``revision`` can be any identifier allowed by git.
-        use_fast (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether or not to use a Fast tokenizer if possible (a :class:`~transformers.PreTrainedTokenizerFast`).
-        use_auth_token (:obj:`str` or `bool`, `optional`):
-            The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
-            generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
-            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            artifacts on huggingface.co, so `revision` can be any identifier allowed by git.
+        use_fast (`bool`, *optional*, defaults to `True`):
+            Whether or not to use a Fast tokenizer if possible (a [`PreTrainedTokenizerFast`]).
+        use_auth_token (`str` or `bool`, *optional*):
+            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+            generated when running `transformers-cli login` (stored in `~/.huggingface`).
+            revision(`str`, *optional*, defaults to `"main"`):
         model_kwargs:
-            Additional dictionary of keyword arguments passed along to the model's :obj:`from_pretrained(...,
-            **model_kwargs)` function.
+            Additional dictionary of keyword arguments passed along to the model's `from_pretrained(..., **model_kwargs)` function.
         kwargs:
             Additional keyword arguments passed along to the specific pipeline init (see the documentation for the
             corresponding pipeline class for possible values).
 
     Returns:
-        :class:`~transformers.Pipeline`: A suitable pipeline for the task.
+        [`Pipeline`]: A suitable pipeline for the task.
 
-    Examples::
+    Examples:
 
-        >>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
+    ```python
+    >>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
 
-        >>> # Sentiment analysis pipeline
-        >>> pipeline('sentiment-analysis')
+    >>> # Sentiment analysis pipeline
+    >>> pipeline('sentiment-analysis')
 
-        >>> # Question answering pipeline, specifying the checkpoint identifier
-        >>> pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')
+    >>> # Question answering pipeline, specifying the checkpoint identifier
+    >>> pipeline('question-answering', model='distilbert-base-cased-distilled-squad', tokenizer='bert-base-cased')
 
-        >>> # Named entity recognition pipeline, passing in a specific model and tokenizer
-        >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
-        >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-        >>> pipeline('ner', model=model, tokenizer=tokenizer)
-    """
+    >>> # Named entity recognition pipeline, passing in a specific model and tokenizer
+    >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+    >>> pipeline('ner', model=model, tokenizer=tokenizer)
+    ```"""
     if model_kwargs is None:
         model_kwargs = {}
 
diff --git a/src/transformers/pipelines/audio_classification.py b/src/transformers/pipelines/audio_classification.py
index 2f01060390..882d01f77e 100644
--- a/src/transformers/pipelines/audio_classification.py
+++ b/src/transformers/pipelines/audio_classification.py
@@ -66,15 +66,14 @@ def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array:
 @add_end_docstrings(PIPELINE_INIT_ARGS)
 class AudioClassificationPipeline(Pipeline):
     """
-    Audio classification pipeline using any :obj:`AutoModelForAudioClassification`. This pipeline predicts the class of
+    Audio classification pipeline using any `AutoModelForAudioClassification`. This pipeline predicts the class of
     a raw waveform or an audio file. In case of an audio file, ffmpeg should be installed to support multiple audio
     formats.
 
-    This pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task identifier:
-    :obj:`"audio-classification"`.
+    This pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+    `"audio-classification"`.
 
-    See the list of available models on `huggingface.co/models
-    <https://huggingface.co/models?filter=audio-classification>`__.
+    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=audio-classification).
     """
 
     def __init__(self, *args, **kwargs):
@@ -93,26 +92,26 @@ class AudioClassificationPipeline(Pipeline):
         **kwargs,
     ):
         """
-        Classify the sequence(s) given as inputs. See the :class:`~transformers.AutomaticSpeechRecognitionPipeline`
+        Classify the sequence(s) given as inputs. See the [`AutomaticSpeechRecognitionPipeline`]
         documentation for more information.
 
         Args:
-            inputs (:obj:`np.ndarray` or :obj:`bytes` or :obj:`str`):
-                The inputs is either a raw waveform (:obj:`np.ndarray` of shape (n, ) of type :obj:`np.float32` or
-                :obj:`np.float64`) at the correct sampling rate (no further check will be done) or a :obj:`str` that is
+            inputs (`np.ndarray` or `bytes` or `str`):
+                The inputs is either a raw waveform (`np.ndarray` of shape (n, ) of type `np.float32` or
+                `np.float64`) at the correct sampling rate (no further check will be done) or a `str` that is
                 the filename of the audio file, the file will be read at the correct sampling rate to get the waveform
-                using `ffmpeg`. This requires `ffmpeg` to be installed on the system. If `inputs` is :obj:`bytes` it is
-                supposed to be the content of an audio file and is interpreted by `ffmpeg` in the same way.
-            top_k (:obj:`int`, `optional`, defaults to None):
-                The number of top labels that will be returned by the pipeline. If the provided number is `None` or
+                using *ffmpeg*. This requires *ffmpeg* to be installed on the system. If `inputs` is `bytes` it is
+                supposed to be the content of an audio file and is interpreted by *ffmpeg* in the same way.
+            top_k (`int`, *optional*, defaults to `None`):
+                The number of top labels that will be returned by the pipeline. If the provided number is `None` or
                 higher than the number of labels available in the model configuration, it will default to the number of
                 labels.
 
         Return:
-            A list of :obj:`dict` with the following keys:
+            A list of `dict` with the following keys:
 
-            - **label** (:obj:`str`) -- The label predicted.
-            - **score** (:obj:`float`) -- The corresponding probability.
+            - **label** (`str`) -- The label predicted.
+            - **score** (`float`) -- The corresponding probability.
         """
         return super().__call__(inputs, **kwargs)
 
diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index efc564ae8a..a6b2ec6e69 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -77,25 +77,25 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
     def __init__(self, feature_extractor: Union["SequenceFeatureExtractor", str], *args, **kwargs):
         """
         Arguments:
-            feature_extractor (:class:`~transformers.SequenceFeatureExtractor`):
+            feature_extractor ([`SequenceFeatureExtractor`]):
                 The feature extractor that will be used by the pipeline to encode waveform for the model.
-            model (:class:`~transformers.PreTrainedModel` or :class:`~transformers.TFPreTrainedModel`):
+            model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
                 The model that will be used by the pipeline to make predictions. This needs to be a model inheriting
-                from :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel`
+                from [`PreTrainedModel`] for PyTorch and [`TFPreTrainedModel`]
                 for TensorFlow.
-            tokenizer (:class:`~transformers.PreTrainedTokenizer`):
+            tokenizer ([`PreTrainedTokenizer`]):
                 The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
-                :class:`~transformers.PreTrainedTokenizer`.
-            modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`):
+                [`PreTrainedTokenizer`].
+            modelcard (`str` or [`ModelCard`], *optional*):
                 Model card attributed to the model for this pipeline.
-            framework (:obj:`str`, `optional`):
-                The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified
+            framework (`str`, *optional*):
+                The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified
                 framework must be installed.
 
                 If no framework is specified, will default to the one currently installed. If no framework is specified
-                and both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if
+                and both frameworks are installed, will default to the framework of the `model`, or to PyTorch if
                 no model is provided.
-            device (:obj:`int`, `optional`, defaults to -1):
+            device (`int`, *optional*, defaults to -1):
                 Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the
                 model on the associated CUDA device id.
         """
@@ -114,21 +114,21 @@ class AutomaticSpeechRecognitionPipeline(Pipeline):
         **kwargs,
     ):
         """
-        Classify the sequence(s) given as inputs. See the :class:`~transformers.AutomaticSpeechRecognitionPipeline`
+        Transcribe the audio sequence(s) given as inputs to text. See the [`AutomaticSpeechRecognitionPipeline`]
         documentation for more information.
 
         Args:
-            inputs (:obj:`np.ndarray` or :obj:`bytes` or :obj:`str`):
-                The inputs is either a raw waveform (:obj:`np.ndarray` of shape (n, ) of type :obj:`np.float32` or
-                :obj:`np.float64`) at the correct sampling rate (no further check will be done) or a :obj:`str` that is
+            inputs (`np.ndarray` or `bytes` or `str`):
+                The inputs is either a raw waveform (`np.ndarray` of shape (n, ) of type `np.float32` or
+                `np.float64`) at the correct sampling rate (no further check will be done) or a `str` that is
                 the filename of the audio file, the file will be read at the correct sampling rate to get the waveform
-                using `ffmpeg`. This requires `ffmpeg` to be installed on the system. If `inputs` is :obj:`bytes` it is
-                supposed to be the content of an audio file and is interpreted by `ffmpeg` in the same way.
+                using *ffmpeg*. This requires *ffmpeg* to be installed on the system. If `inputs` is `bytes` it is
+                supposed to be the content of an audio file and is interpreted by *ffmpeg* in the same way.
 
         Return:
-            A :obj:`dict` with the following keys:
+            A `dict` with the following keys:
 
-            - **text** (:obj:`str`) -- The recognized text.
+            - **text** (`str`) -- The recognized text.
         """
         return super().__call__(inputs, **kwargs)
 
diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py
index 5bf8c31886..383f37e8e4 100644
--- a/src/transformers/pipelines/base.py
+++ b/src/transformers/pipelines/base.py
@@ -145,30 +145,29 @@ def infer_framework_load_model(
     **model_kwargs
 ):
     """
-    Select framework (TensorFlow or PyTorch) to use from the :obj:`model` passed. Returns a tuple (framework, model).
+    Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).
 
-    If :obj:`model` is instantiated, this function will just infer the framework from the model class. Otherwise
-    :obj:`model` is actually a checkpoint name and this method will try to instantiate it using :obj:`model_classes`.
+    If `model` is instantiated, this function will just infer the framework from the model class. Otherwise
+    `model` is actually a checkpoint name and this method will try to instantiate it using `model_classes`.
     Since we don't want to instantiate the model twice, this model is returned for use by the pipeline.
 
-    If both frameworks are installed and available for :obj:`model`, PyTorch is selected.
+    If both frameworks are installed and available for `model`, PyTorch is selected.
 
     Args:
-        model (:obj:`str`, :class:`~transformers.PreTrainedModel` or :class:`~transformers.TFPreTrainedModel`):
-            The model to infer the framework from. If :obj:`str`, a checkpoint name. The model to infer the framewrok
+        model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
+            The model to infer the framework from. If `str`, a checkpoint name. The model to infer the framework
             from.
-        config (:class:`~transformers.AutoConfig`):
+        config ([`AutoConfig`]):
             The config associated with the model to help using the correct class
-        model_classes (dictionary :obj:`str` to :obj:`type`, `optional`):
+        model_classes (dictionary `str` to `type`, *optional*):
             A mapping framework to class.
-        task (:obj:`str`):
+        task (`str`):
             The task defining which pipeline will be returned.
         model_kwargs:
-            Additional dictionary of keyword arguments passed along to the model's :obj:`from_pretrained(...,
-            **model_kwargs)` function.
+            Additional dictionary of keyword arguments passed along to the model's `from_pretrained(..., **model_kwargs)` function.
 
     Returns:
-        :obj:`Tuple`: A tuple framework, model.
+        `Tuple`: A tuple framework, model.
     """
     if not is_tf_available() and not is_torch_available():
         raise RuntimeError(
@@ -242,28 +241,27 @@ def infer_framework_from_model(
     **model_kwargs
 ):
     """
-    Select framework (TensorFlow or PyTorch) to use from the :obj:`model` passed. Returns a tuple (framework, model).
+    Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).
 
-    If :obj:`model` is instantiated, this function will just infer the framework from the model class. Otherwise
-    :obj:`model` is actually a checkpoint name and this method will try to instantiate it using :obj:`model_classes`.
+    If `model` is instantiated, this function will just infer the framework from the model class. Otherwise
+    `model` is actually a checkpoint name and this method will try to instantiate it using `model_classes`.
     Since we don't want to instantiate the model twice, this model is returned for use by the pipeline.
 
-    If both frameworks are installed and available for :obj:`model`, PyTorch is selected.
+    If both frameworks are installed and available for `model`, PyTorch is selected.
 
     Args:
-        model (:obj:`str`, :class:`~transformers.PreTrainedModel` or :class:`~transformers.TFPreTrainedModel`):
-            The model to infer the framework from. If :obj:`str`, a checkpoint name. The model to infer the framewrok
+        model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
+            The model to infer the framework from. If `str`, a checkpoint name. The model to infer the framework
             from.
-        model_classes (dictionary :obj:`str` to :obj:`type`, `optional`):
+        model_classes (dictionary `str` to `type`, *optional*):
             A mapping framework to class.
-        task (:obj:`str`):
+        task (`str`):
             The task defining which pipeline will be returned.
         model_kwargs:
-            Additional dictionary of keyword arguments passed along to the model's :obj:`from_pretrained(...,
-            **model_kwargs)` function.
+            Additional dictionary of keyword arguments passed along to the model's `from_pretrained(..., **model_kwargs)` function.
 
     Returns:
-        :obj:`Tuple`: A tuple framework, model.
+        `Tuple`: A tuple framework, model.
     """
     if isinstance(model, str):
         config = AutoConfig.from_pretrained(model, _from_pipeline=task, **model_kwargs)
@@ -279,7 +277,7 @@ def get_framework(model, revision: Optional[str] = None):
     Select framework (TensorFlow or PyTorch) to use.
 
     Args:
-        model (:obj:`str`, :class:`~transformers.PreTrainedModel` or :class:`~transformers.TFPreTrainedModel`):
+        model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel`]):
             If both frameworks are installed, picks the one corresponding to the model passed (either a model class or
             the model name). If no specific model is provided, defaults to using PyTorch.
     """
@@ -313,19 +311,19 @@ def get_default_model(targeted_task: Dict, framework: Optional[str], task_option
     Select a default model to use for a given task. Defaults to pytorch if ambiguous.
 
     Args:
-        targeted_task (:obj:`Dict` ):
+        targeted_task (`Dict` ):
            Dictionary representing the given task, that should contain default models
 
-        framework (:obj:`str`, None)
+        framework (`str`, None):
            "pt", "tf" or None, representing a specific framework if it was specified, or None if we don't know yet.
 
-        task_options (:obj:`Any`, None)
+        task_options (`Any`, None):
            Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for
            translation task.
 
     Returns
 
-        :obj:`str` The model string representing the default model for this pipeline
+        `str`: The model string representing the default model for this pipeline
     """
     if is_torch_available() and not is_tf_available():
         framework = "pt"
@@ -352,12 +350,12 @@ def get_default_model(targeted_task: Dict, framework: Optional[str], task_option
 
 class PipelineException(Exception):
     """
-    Raised by a :class:`~transformers.Pipeline` when handling __call__.
+    Raised by a [`Pipeline`] when handling `__call__`.
 
     Args:
-        task (:obj:`str`): The task of the pipeline.
-        model (:obj:`str`): The model used by the pipeline.
-        reason (:obj:`str`): The error message to display.
+        task (`str`): The task of the pipeline.
+        model (`str`): The model used by the pipeline.
+        reason (`str`): The error message to display.
     """
 
     def __init__(self, task: str, model: str, reason: str):
@@ -369,7 +367,7 @@ class PipelineException(Exception):
 
 class ArgumentHandler(ABC):
     """
-    Base interface for handling arguments for each :class:`~transformers.pipelines.Pipeline`.
+    Base interface for handling arguments for each [`~pipelines.Pipeline`].
     """
 
     @abstractmethod
@@ -386,15 +384,15 @@ class PipelineDataFormat:
     - CSV
     - stdin/stdout (pipe)
 
-    :obj:`PipelineDataFormat` also includes some utilities to work with multi-columns like mapping from datasets
-    columns to pipelines keyword arguments through the :obj:`dataset_kwarg_1=dataset_column_1` format.
+    `PipelineDataFormat` also includes some utilities to work with multi-columns like mapping from datasets
+    columns to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
 
     Args:
-        output_path (:obj:`str`, `optional`): Where to save the outgoing data.
-        input_path (:obj:`str`, `optional`): Where to look for the input data.
-        column (:obj:`str`, `optional`): The column to read.
-        overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to overwrite the :obj:`output_path`.
+        output_path (`str`, *optional*): Where to save the outgoing data.
+        input_path (`str`, *optional*): Where to look for the input data.
+        column (`str`, *optional*): The column to read.
+        overwrite (`bool`, *optional*, defaults to `False`):
+            Whether or not to overwrite the `output_path`.
     """
 
     SUPPORTED_FORMATS = ["json", "csv", "pipe"]
@@ -430,10 +428,10 @@ class PipelineDataFormat:
     def save(self, data: Union[dict, List[dict]]):
         """
         Save the provided data object with the representation for the current
-        :class:`~transformers.pipelines.PipelineDataFormat`.
+        [`~pipelines.PipelineDataFormat`].
 
         Args:
-            data (:obj:`dict` or list of :obj:`dict`): The data to store.
+            data (`dict` or list of `dict`): The data to store.
         """
         raise NotImplementedError()
 
@@ -442,10 +440,10 @@ class PipelineDataFormat:
         Save the provided data object as a pickle-formatted binary data on the disk.
 
         Args:
-            data (:obj:`dict` or list of :obj:`dict`): The data to store.
+            data (`dict` or list of `dict`): The data to store.
 
         Returns:
-            :obj:`str`: Path where the data has been saved.
+            `str`: Path where the data has been saved.
         """
         path, _ = os.path.splitext(self.output_path)
         binary_path = os.path.extsep.join((path, "pickle"))
@@ -464,23 +462,23 @@ class PipelineDataFormat:
         overwrite=False,
     ) -> "PipelineDataFormat":
         """
-        Creates an instance of the right subclass of :class:`~transformers.pipelines.PipelineDataFormat` depending on
-        :obj:`format`.
+        Creates an instance of the right subclass of [`~pipelines.PipelineDataFormat`] depending on
+        `format`.
 
         Args:
-            format: (:obj:`str`):
-                The format of the desired pipeline. Acceptable values are :obj:`"json"`, :obj:`"csv"` or :obj:`"pipe"`.
-            output_path (:obj:`str`, `optional`):
+            format (`str`):
+                The format of the desired pipeline. Acceptable values are `"json"`, `"csv"` or `"pipe"`.
+            output_path (`str`, *optional*):
                 Where to save the outgoing data.
-            input_path (:obj:`str`, `optional`):
+            input_path (`str`, *optional*):
                 Where to look for the input data.
-            column (:obj:`str`, `optional`):
+            column (`str`, *optional*):
                 The column to read.
-            overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not to overwrite the :obj:`output_path`.
+            overwrite (`bool`, *optional*, defaults to `False`):
+                Whether or not to overwrite the `output_path`.
 
         Returns:
-            :class:`~transformers.pipelines.PipelineDataFormat`: The proper data format.
+            [`~pipelines.PipelineDataFormat`]: The proper data format.
         """
         if format == "json":
             return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
@@ -497,11 +495,11 @@ class CsvPipelineDataFormat(PipelineDataFormat):
     Support for pipelines using CSV data format.
 
     Args:
-        output_path (:obj:`str`, `optional`): Where to save the outgoing data.
-        input_path (:obj:`str`, `optional`): Where to look for the input data.
-        column (:obj:`str`, `optional`): The column to read.
-        overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to overwrite the :obj:`output_path`.
+        output_path (`str`, *optional*): Where to save the outgoing data.
+        input_path (`str`, *optional*): Where to look for the input data.
+        column (`str`, *optional*): The column to read.
+        overwrite (`bool`, *optional*, defaults to `False`):
+            Whether or not to overwrite the `output_path`.
     """
 
     def __init__(
@@ -525,10 +523,10 @@ class CsvPipelineDataFormat(PipelineDataFormat):
     def save(self, data: List[dict]):
         """
         Save the provided data object with the representation for the current
-        :class:`~transformers.pipelines.PipelineDataFormat`.
+        [`~pipelines.PipelineDataFormat`].
 
         Args:
-            data (:obj:`List[dict]`): The data to store.
+            data (`List[dict]`): The data to store.
         """
         with open(self.output_path, "w") as f:
             if len(data) > 0:
@@ -542,11 +540,11 @@ class JsonPipelineDataFormat(PipelineDataFormat):
     Support for pipelines using JSON file format.
 
     Args:
-        output_path (:obj:`str`, `optional`): Where to save the outgoing data.
-        input_path (:obj:`str`, `optional`): Where to look for the input data.
-        column (:obj:`str`, `optional`): The column to read.
-        overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to overwrite the :obj:`output_path`.
+        output_path (`str`, *optional*): Where to save the outgoing data.
+        input_path (`str`, *optional*): Where to look for the input data.
+        column (`str`, *optional*): The column to read.
+        overwrite (`bool`, *optional*, defaults to `False`):
+            Whether or not to overwrite the `output_path`.
     """
 
     def __init__(
@@ -573,7 +571,7 @@ class JsonPipelineDataFormat(PipelineDataFormat):
         Save the provided data object in a json file.
 
         Args:
-            data (:obj:`dict`): The data to store.
+            data (`dict`): The data to store.
         """
         with open(self.output_path, "w") as f:
             json.dump(data, f)
@@ -586,11 +584,11 @@ class PipedPipelineDataFormat(PipelineDataFormat):
     If columns are provided, then the output will be a dictionary with {column_x: value_x}
 
     Args:
-        output_path (:obj:`str`, `optional`): Where to save the outgoing data.
-        input_path (:obj:`str`, `optional`): Where to look for the input data.
-        column (:obj:`str`, `optional`): The column to read.
-        overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to overwrite the :obj:`output_path`.
+        output_path (`str`, *optional*): Where to save the outgoing data.
+        input_path (`str`, *optional*): Where to look for the input data.
+        column (`str`, *optional*): The column to read.
+        overwrite (`bool`, *optional*, defaults to `False`):
+            Whether or not to overwrite the `output_path`.
     """
 
     def __iter__(self):
@@ -614,7 +612,7 @@ class PipedPipelineDataFormat(PipelineDataFormat):
         Print the data.
 
         Args:
-            data (:obj:`dict`): The data to store.
+            data (`dict`): The data to store.
         """
         print(data)
 
@@ -644,37 +642,36 @@ class _ScikitCompat(ABC):
 
 PIPELINE_INIT_ARGS = r"""
     Arguments:
-        model (:class:`~transformers.PreTrainedModel` or :class:`~transformers.TFPreTrainedModel`):
+        model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
             The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
-            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            [`PreTrainedModel`] for PyTorch and [`TFPreTrainedModel`] for
             TensorFlow.
-        tokenizer (:class:`~transformers.PreTrainedTokenizer`):
+        tokenizer ([`PreTrainedTokenizer`]):
             The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
-            :class:`~transformers.PreTrainedTokenizer`.
-        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`):
+            [`PreTrainedTokenizer`].
+        modelcard (`str` or [`ModelCard`], *optional*):
             Model card attributed to the model for this pipeline.
-        framework (:obj:`str`, `optional`):
-            The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
+        framework (`str`, *optional*):
+            The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework
             must be installed.
 
             If no framework is specified, will default to the one currently installed. If no framework is specified and
-            both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no model
+            both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model
             is provided.
-        task (:obj:`str`, defaults to :obj:`""`):
+        task (`str`, defaults to `""`):
             A task-identifier for the pipeline.
-        num_workers (:obj:`int`, `optional`, defaults to 8):
-            When the pipeline will use `DataLoader` (when passing a dataset, on GPU for a Pytorch model), the number of
+        num_workers (`int`, *optional*, defaults to 8):
+            When the pipeline will use *DataLoader* (when passing a dataset, on GPU for a Pytorch model), the number of
             workers to be used.
-        batch_size (:obj:`int`, `optional`, defaults to 1):
-            When the pipeline will use `DataLoader` (when passing a dataset, on GPU for a Pytorch model), the size of
-            the batch to use, for inference this is not always beneficial, please read `Batching with pipelines
-            <https://huggingface.co/transformers/main_classes/pipelines.html#pipeline-batching>`_ .
-        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
+        batch_size (`int`, *optional*, defaults to 1):
+            When the pipeline will use *DataLoader* (when passing a dataset, on GPU for a Pytorch model), the size of
+            the batch to use, for inference this is not always beneficial, please read [Batching with pipelines](https://huggingface.co/transformers/main_classes/pipelines.html#pipeline-batching).
+        args_parser ([`~pipelines.ArgumentHandler`], *optional*):
             Reference to the object in charge of parsing supplied pipeline parameters.
-        device (:obj:`int`, `optional`, defaults to -1):
+        device (`int`, *optional*, defaults to -1):
             Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on
             the associated CUDA device id.
-        binary_output (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        binary_output (`bool`, *optional*, defaults to `False`):
             Flag indicating if the output the pipeline should happen in a binary format (i.e., pickle) or as raw text.
 """
 
@@ -699,29 +696,29 @@ if is_torch_available():
             """
             Roughly equivalent to
 
-            .. code-block::
-                for item in loader:
-                    yield infer(item, **params)
+            ```python
+            for item in loader:
+                yield infer(item, **params)
+            ```
 
             Arguments:
-                loader (:obj:`torch.utils.data.DataLoader` or any iterator):
-                    The iterator that will be used to apply :obj:`infer` on.
+                loader (`torch.utils.data.DataLoader` or any iterator):
+                    The iterator that will be used to apply `infer` on.
                 infer (any function):
-                    The function to apply of each element of :obj:`loader`.
-                params (:obj:`dict`):
-                    The parameters passed to :obj:`infer` along with every item
-                loader_batch_size (:obj:`int`, `optional`):
-                    If specified, the items of :obj:`loader` are supposed to come as batch, and are loader_batched here
+                    The function to apply to each element of `loader`.
+                params (`dict`):
+                    The parameters passed to `infer` along with every item
+                loader_batch_size (`int`, *optional*):
+                    If specified, the items of `loader` are supposed to come as batch, and are loader_batched here
                     making it roughly behave as
 
 
-                    .. code-block::
-
-                        for items in loader:
-                            for i in loader_batch_size:
-                                item = items[i]
-                                yield infer(item, **params)
-            """
+            ```python
+            for items in loader:
+                for i in range(loader_batch_size):
+                    item = items[i]
+                    yield infer(item, **params)
+            ```"""
             self.loader = loader
             self.infer = infer
             self.params = params
@@ -815,9 +812,9 @@ class Pipeline(_ScikitCompat):
 
     Pipeline supports running on CPU or GPU through the device argument (see below).
 
-    Some pipeline, like for instance :class:`~transformers.FeatureExtractionPipeline` (:obj:`'feature-extraction'` )
+    Some pipelines, like for instance [`FeatureExtractionPipeline`] (`'feature-extraction'`)
     output large tensor object as nested-lists. In order to avoid dumping such large structure as textual data we
-    provide the :obj:`binary_output` constructor argument. If set to :obj:`True`, the output will be stored in the
+    provide the `binary_output` constructor argument. If set to `True`, the output will be stored in the
     pickle format.
     """
 
@@ -866,7 +863,7 @@ class Pipeline(_ScikitCompat):
         Save the pipeline's model and tokenizer.
 
         Args:
-            save_directory (:obj:`str`):
+            save_directory (`str`):
                 A path to the directory where to saved. It will be created if it doesn't exist.
         """
         if os.path.isfile(save_directory):
@@ -905,14 +902,15 @@ class Pipeline(_ScikitCompat):
         Returns:
             Context manager
 
-        Examples::
+        Examples:
 
-            # Explicitly ask for tensor allocation on CUDA device :0
-            pipe = pipeline(..., device=0)
-            with pipe.device_placement():
-                # Every framework specific tensor allocation will be done on the request device
-                output = pipe(...)
-        """
+        ```python
+        # Explicitly ask for tensor allocation on CUDA device :0
+        pipe = pipeline(..., device=0)
+        with pipe.device_placement():
+            # Every framework-specific tensor allocation will be done on the requested device
+            output = pipe(...)
+        ```"""
         if self.framework == "tf":
             with tf.device("/CPU:0" if self.device == -1 else f"/device:GPU:{self.device}"):
                 yield
@@ -927,11 +925,11 @@ class Pipeline(_ScikitCompat):
         Ensure PyTorch tensors are on the specified device.
 
         Args:
-            inputs (keyword arguments that should be :obj:`torch.Tensor`, the rest is ignored): The tensors to place on :obj:`self.device`.
+            inputs (keyword arguments that should be `torch.Tensor`, the rest is ignored): The tensors to place on `self.device`.
             Recursive on lists **only**.
 
         Return:
-            :obj:`Dict[str, torch.Tensor]`: The same as :obj:`inputs` but on the proper device.
+            `Dict[str, torch.Tensor]`: The same as `inputs` but on the proper device.
         """
         return self._ensure_tensor_on_device(inputs, self.device)
 
@@ -958,7 +956,7 @@ class Pipeline(_ScikitCompat):
         Check if the model class is in supported by the pipeline.
 
         Args:
-            supported_models (:obj:`List[str]` or :obj:`dict`):
+            supported_models (`List[str]` or `dict`):
                 The list of models supported by the pipeline, or a dictionary with model class values.
         """
         if not isinstance(supported_models, list):  # Create from a model mapping
diff --git a/src/transformers/pipelines/conversational.py b/src/transformers/pipelines/conversational.py
index 2fd90061a5..c5653e57e4 100644
--- a/src/transformers/pipelines/conversational.py
+++ b/src/transformers/pipelines/conversational.py
@@ -19,41 +19,42 @@ logger = logging.get_logger(__name__)
 class Conversation:
     """
     Utility class containing a conversation and its history. This class is meant to be used as an input to the
-    :class:`~transformers.ConversationalPipeline`. The conversation contains a number of utility function to manage the
+    [`ConversationalPipeline`]. The conversation contains a number of utility function to manage the
     addition of new user input and generated model responses. A conversation needs to contain an unprocessed user input
-    before being passed to the :class:`~transformers.ConversationalPipeline`. This user input is either created when
-    the class is instantiated, or by calling :obj:`conversational_pipeline.append_response("input")` after a
+    before being passed to the [`ConversationalPipeline`]. This user input is either created when
+    the class is instantiated, or by calling `conversation.add_user_input("input")` after a
     conversation turn.
 
     Arguments:
-        text (:obj:`str`, `optional`):
+        text (`str`, *optional*):
             The initial user input to start the conversation. If not provided, a user input needs to be provided
-            manually using the :meth:`~transformers.Conversation.add_user_input` method before the conversation can
+            manually using the [`~Conversation.add_user_input`] method before the conversation can
             begin.
-        conversation_id (:obj:`uuid.UUID`, `optional`):
+        conversation_id (`uuid.UUID`, *optional*):
             Unique identifier for the conversation. If not provided, a random UUID4 id will be assigned to the
             conversation.
-        past_user_inputs (:obj:`List[str]`, `optional`):
+        past_user_inputs (`List[str]`, *optional*):
             Eventual past history of the conversation of the user. You don't need to pass it manually if you use the
-            pipeline interactively but if you want to recreate history you need to set both :obj:`past_user_inputs` and
-            :obj:`generated_responses` with equal length lists of strings
-        generated_responses (:obj:`List[str]`, `optional`):
+            pipeline interactively but if you want to recreate history you need to set both `past_user_inputs` and
+            `generated_responses` with equal length lists of strings
+        generated_responses (`List[str]`, *optional*):
             Eventual past history of the conversation of the model. You don't need to pass it manually if you use the
-            pipeline interactively but if you want to recreate history you need to set both :obj:`past_user_inputs` and
-            :obj:`generated_responses` with equal length lists of strings
+            pipeline interactively but if you want to recreate history you need to set both `past_user_inputs` and
+            `generated_responses` with equal length lists of strings
 
-    Usage::
+    Usage:
 
-        conversation = Conversation("Going to the movies tonight - any suggestions?")
+    ```python
+    conversation = Conversation("Going to the movies tonight - any suggestions?")
 
-        # Steps usually performed by the model when generating a response:
-        # 1. Mark the user input as processed (moved to the history)
-        conversation.mark_processed()
-        # 2. Append a mode response
-        conversation.append_response("The Big lebowski.")
+    # Steps usually performed by the model when generating a response:
+    # 1. Mark the user input as processed (moved to the history)
+    conversation.mark_processed()
+    # 2. Append a mode response
+    conversation.append_response("The Big lebowski.")
 
-        conversation.add_user_input("Is it good?")
-    """
+    conversation.add_user_input("Is it good?")
+    ```"""
 
     def __init__(
         self, text: str = None, conversation_id: uuid.UUID = None, past_user_inputs=None, generated_responses=None
@@ -83,12 +84,12 @@ class Conversation:
 
     def add_user_input(self, text: str, overwrite: bool = False):
         """
-        Add a user input to the conversation for the next round. This populates the internal :obj:`new_user_input`
+        Add a user input to the conversation for the next round. This populates the internal `new_user_input`
         field.
 
         Args:
-            text (:obj:`str`): The user input for the next conversation round.
-            overwrite (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            text (`str`): The user input for the next conversation round.
+            overwrite (`bool`, *optional*, defaults to `False`):
                 Whether or not existing and unprocessed user input should be overwritten when this function is called.
         """
         if self.new_user_input:
@@ -108,8 +109,8 @@ class Conversation:
 
     def mark_processed(self):
         """
-        Mark the conversation as processed (moves the content of :obj:`new_user_input` to :obj:`past_user_inputs`) and
-        empties the :obj:`new_user_input` field.
+        Mark the conversation as processed (moves the content of `new_user_input` to `past_user_inputs`) and
+        empties the `new_user_input` field.
         """
         if self.new_user_input:
             self.past_user_inputs.append(self.new_user_input)
@@ -120,7 +121,7 @@ class Conversation:
         Append a response to the list of generated responses.
 
         Args:
-            response (:obj:`str`): The model generated response.
+            response (`str`): The model generated response.
         """
         self.generated_responses.append(response)
 
@@ -128,8 +129,8 @@ class Conversation:
         """
         Iterates over all blobs of the conversation.
 
-        Returns: Iterator of (is_user, text_chunk) in chronological order of the conversation. ``is_user`` is a
-        :obj:`bool`, ``text_chunks`` is a :obj:`str`.
+        Returns: Iterator of (is_user, text_chunk) in chronological order of the conversation. `is_user` is a
+        `bool`, `text_chunk` is a `str`.
         """
         for user_input, generated_response in zip(self.past_user_inputs, self.generated_responses):
             yield True, user_input
@@ -142,7 +143,7 @@ class Conversation:
         Generates a string representation of the conversation.
 
         Return:
-            :obj:`str`:
+            `str`:
 
             Example: Conversation id: 7d15686b-dc94-49f2-9c4b-c9eac6a1f114 user >> Going to the movies tonight - any
             suggestions? bot >> The Big Lebowski
@@ -157,9 +158,9 @@ class Conversation:
 @add_end_docstrings(
     PIPELINE_INIT_ARGS,
     r"""
-        min_length_for_response (:obj:`int`, `optional`, defaults to 32):
+        min_length_for_response (`int`, *optional*, defaults to 32):
             The minimum length (in number of tokens) for a response.
-        minimum_tokens (:obj:`int`, `optional`, defaults to 10):
+        minimum_tokens (`int`, *optional*, defaults to 10):
             The minimum length of tokens to leave for a response.
     """,
 )
@@ -167,28 +168,28 @@ class ConversationalPipeline(Pipeline):
     """
     Multi-turn conversational pipeline.
 
-    This conversational pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
-    identifier: :obj:`"conversational"`.
+    This conversational pipeline can currently be loaded from [`pipeline`] using the following task
+    identifier: `"conversational"`.
 
     The models that this pipeline can use are models that have been fine-tuned on a multi-turn conversational task,
-    currently: `'microsoft/DialoGPT-small'`, `'microsoft/DialoGPT-medium'`, `'microsoft/DialoGPT-large'`. See the
-    up-to-date list of available models on `huggingface.co/models
-    <https://huggingface.co/models?filter=conversational>`__.
+    currently: *'microsoft/DialoGPT-small'*, *'microsoft/DialoGPT-medium'*, *'microsoft/DialoGPT-large'*. See the
+    up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=conversational).
 
-    Usage::
+    Usage:
 
-        conversational_pipeline = pipeline("conversational")
+    ```python
+    conversational_pipeline = pipeline("conversational")
 
-        conversation_1 = Conversation("Going to the movies tonight - any suggestions?")
-        conversation_2 = Conversation("What's the last book you have read?")
+    conversation_1 = Conversation("Going to the movies tonight - any suggestions?")
+    conversation_2 = Conversation("What's the last book you have read?")
 
-        conversational_pipeline([conversation_1, conversation_2])
+    conversational_pipeline([conversation_1, conversation_2])
 
-        conversation_1.add_user_input("Is it an action movie?")
-        conversation_2.add_user_input("What is the genre of this book?")
+    conversation_1.add_user_input("Is it an action movie?")
+    conversation_2.add_user_input("What is the genre of this book?")
 
-        conversational_pipeline([conversation_1, conversation_2])
-    """
+    conversational_pipeline([conversation_1, conversation_2])
+    ```"""
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -222,16 +223,16 @@ class ConversationalPipeline(Pipeline):
         Generate responses for the conversation(s) given as inputs.
 
         Args:
-            conversations (a :class:`~transformers.Conversation` or a list of :class:`~transformers.Conversation`):
+            conversations (a [`Conversation`] or a list of [`Conversation`]):
                 Conversations to generate responses for.
-            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                 Whether or not to clean up the potential extra spaces in the text output.
             generate_kwargs:
                 Additional keyword arguments to pass along to the generate method of the model (see the generate method
-                corresponding to your framework `here <./model.html#generative-models>`__).
+                corresponding to your framework [here](./model#generative-models)).
 
         Returns:
-            :class:`~transformers.Conversation` or a list of :class:`~transformers.Conversation`: Conversation(s) with
+            [`Conversation`] or a list of [`Conversation`]: Conversation(s) with
             updated generated responses for those containing a new user input.
         """
         # XXX: num_workers==0 is required to be backward compatible
diff --git a/src/transformers/pipelines/feature_extraction.py b/src/transformers/pipelines/feature_extraction.py
index 1e2433a741..082a5f4027 100644
--- a/src/transformers/pipelines/feature_extraction.py
+++ b/src/transformers/pipelines/feature_extraction.py
@@ -9,34 +9,34 @@ class FeatureExtractionPipeline(Pipeline):
     Feature extraction pipeline using no model head. This pipeline extracts the hidden states from the base
     transformer, which can be used as features in downstream tasks.
 
-    This feature extraction pipeline can currently be loaded from :func:`~transformers.pipeline` using the task
-    identifier: :obj:`"feature-extraction"`.
+    This feature extraction pipeline can currently be loaded from [`pipeline`] using the task
+    identifier: `"feature-extraction"`.
 
     All models may be used for this pipeline. See a list of all models, including community-contributed models on
-    `huggingface.co/models <https://huggingface.co/models>`__.
+    [huggingface.co/models](https://huggingface.co/models).
 
     Arguments:
-        model (:class:`~transformers.PreTrainedModel` or :class:`~transformers.TFPreTrainedModel`):
+        model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
             The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
-            :class:`~transformers.PreTrainedModel` for PyTorch and :class:`~transformers.TFPreTrainedModel` for
+            [`PreTrainedModel`] for PyTorch and [`TFPreTrainedModel`] for
             TensorFlow.
-        tokenizer (:class:`~transformers.PreTrainedTokenizer`):
+        tokenizer ([`PreTrainedTokenizer`]):
             The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
-            :class:`~transformers.PreTrainedTokenizer`.
-        modelcard (:obj:`str` or :class:`~transformers.ModelCard`, `optional`):
+            [`PreTrainedTokenizer`].
+        modelcard (`str` or [`ModelCard`], *optional*):
             Model card attributed to the model for this pipeline.
-        framework (:obj:`str`, `optional`):
-            The framework to use, either :obj:`"pt"` for PyTorch or :obj:`"tf"` for TensorFlow. The specified framework
+        framework (`str`, *optional*):
+            The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework
             must be installed.
 
             If no framework is specified, will default to the one currently installed. If no framework is specified and
-            both frameworks are installed, will default to the framework of the :obj:`model`, or to PyTorch if no model
+            both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model
             is provided.
-        task (:obj:`str`, defaults to :obj:`""`):
+        task (`str`, defaults to `""`):
             A task-identifier for the pipeline.
-        args_parser (:class:`~transformers.pipelines.ArgumentHandler`, `optional`):
+        args_parser ([`~pipelines.ArgumentHandler`], *optional*):
             Reference to the object in charge of parsing supplied pipeline parameters.
-        device (:obj:`int`, `optional`, defaults to -1):
+        device (`int`, *optional*, defaults to -1):
             Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run the model on
             the associated CUDA device id.
     """
@@ -72,9 +72,9 @@ class FeatureExtractionPipeline(Pipeline):
         Extract the features of the input(s).
 
         Args:
-            args (:obj:`str` or :obj:`List[str]`): One or several texts (or one list of texts) to get the features of.
+            args (`str` or `List[str]`): One or several texts (or one list of texts) to get the features of.
 
         Return:
-            A nested list of :obj:`float`: The features computed by the model.
+            A nested list of `float`: The features computed by the model.
         """
         return super().__call__(*args, **kwargs)
diff --git a/src/transformers/pipelines/fill_mask.py b/src/transformers/pipelines/fill_mask.py
index 2e7ab0ed90..7247cd0477 100644
--- a/src/transformers/pipelines/fill_mask.py
+++ b/src/transformers/pipelines/fill_mask.py
@@ -21,9 +21,9 @@ logger = logging.get_logger(__name__)
 @add_end_docstrings(
     PIPELINE_INIT_ARGS,
     r"""
-        top_k (:obj:`int`, defaults to 5):
+        top_k (`int`, defaults to 5):
             The number of predictions to return.
-        targets (:obj:`str` or :obj:`List[str]`, `optional`):
+        targets (`str` or `List[str]`, *optional*):
             When passed, the model will limit the scores to the passed targets instead of looking up in the whole
             vocab. If the provided targets are not in the model vocab, they will be tokenized and the first resulting
             token will be used (with a warning, and that might be slower).
@@ -32,22 +32,23 @@ logger = logging.get_logger(__name__)
 )
 class FillMaskPipeline(Pipeline):
     """
-    Masked language modeling prediction pipeline using any :obj:`ModelWithLMHead`. See the `masked language modeling
-    examples <../task_summary.html#masked-language-modeling>`__ for more information.
+    Masked language modeling prediction pipeline using any `ModelWithLMHead`. See the [masked language modeling
+    examples](../task_summary#masked-language-modeling) for more information.
 
-    This mask filling pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
-    identifier: :obj:`"fill-mask"`.
+    This mask filling pipeline can currently be loaded from [`pipeline`] using the following task
+    identifier: `"fill-mask"`.
 
     The models that this pipeline can use are models that have been trained with a masked language modeling objective,
     which includes the bi-directional models in the library. See the up-to-date list of available models on
-    `huggingface.co/models <https://huggingface.co/models?filter=fill-mask>`__.
+    [huggingface.co/models](https://huggingface.co/models?filter=fill-mask).
 
-    .. note::
+    <Tip>
 
-        This pipeline only works for inputs with exactly one token masked. Experimental: We added support for multiple
-        masks. The returned values are raw model output, and correspond to disjoint probabilities where one might
-        expect joint probabilities (See `discussion <https://github.com/huggingface/transformers/pull/10222>`__).
-    """
+    This pipeline only works for inputs with exactly one token masked. Experimental: We added support for multiple
+    masks. The returned values are raw model output, and correspond to disjoint probabilities where one might
+    expect joint probabilities (See [discussion](https://github.com/huggingface/transformers/pull/10222)).
+
+    </Tip>"""
 
     def get_masked_index(self, input_ids: GenericTensor) -> np.ndarray:
         if self.framework == "tf":
@@ -205,22 +206,22 @@ class FillMaskPipeline(Pipeline):
         Fill the masked token in the text(s) given as inputs.
 
         Args:
-            args (:obj:`str` or :obj:`List[str]`):
+            args (`str` or `List[str]`):
                 One or several texts (or one list of prompts) with masked tokens.
-            targets (:obj:`str` or :obj:`List[str]`, `optional`):
+            targets (`str` or `List[str]`, *optional*):
                 When passed, the model will limit the scores to the passed targets instead of looking up in the whole
                 vocab. If the provided targets are not in the model vocab, they will be tokenized and the first
                 resulting token will be used (with a warning, and that might be slower).
-            top_k (:obj:`int`, `optional`):
+            top_k (`int`, *optional*):
                 When passed, overrides the number of predictions to return.
 
         Return:
-            A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys:
+            A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:
 
-            - **sequence** (:obj:`str`) -- The corresponding input with the mask token prediction.
-            - **score** (:obj:`float`) -- The corresponding probability.
-            - **token** (:obj:`int`) -- The predicted token id (to replace the masked one).
-            - **token** (:obj:`str`) -- The predicted token (to replace the masked one).
+            - **sequence** (`str`) -- The corresponding input with the mask token prediction.
+            - **score** (`float`) -- The corresponding probability.
+            - **token** (`int`) -- The predicted token id (to replace the masked one).
+            - **token_str** (`str`) -- The predicted token (to replace the masked one).
         """
         outputs = super().__call__(inputs, **kwargs)
         if isinstance(inputs, list) and len(inputs) == 1:
diff --git a/src/transformers/pipelines/image_classification.py b/src/transformers/pipelines/image_classification.py
index 2d2ab68cba..466566f37c 100644
--- a/src/transformers/pipelines/image_classification.py
+++ b/src/transformers/pipelines/image_classification.py
@@ -19,14 +19,13 @@ logger = logging.get_logger(__name__)
 @add_end_docstrings(PIPELINE_INIT_ARGS)
 class ImageClassificationPipeline(Pipeline):
     """
-    Image classification pipeline using any :obj:`AutoModelForImageClassification`. This pipeline predicts the class of
+    Image classification pipeline using any `AutoModelForImageClassification`. This pipeline predicts the class of
     an image.
 
-    This image classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
-    task identifier: :obj:`"image-classification"`.
+    This image classification pipeline can currently be loaded from [`pipeline`] using the following
+    task identifier: `"image-classification"`.
 
-    See the list of available models on `huggingface.co/models
-    <https://huggingface.co/models?filter=image-classification>`__.
+    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=image-classification).
     """
 
     def __init__(self, *args, **kwargs):
@@ -49,7 +48,7 @@ class ImageClassificationPipeline(Pipeline):
         Assign labels to the image(s) passed as inputs.
 
         Args:
-            images (:obj:`str`, :obj:`List[str]`, :obj:`PIL.Image` or :obj:`List[PIL.Image]`):
+            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                 The pipeline handles three types of images:
 
                 - A string containing a http link pointing to an image
@@ -59,7 +58,7 @@ class ImageClassificationPipeline(Pipeline):
                 The pipeline accepts either a single image or a batch of images, which must then be passed as a string.
                 Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
                 images.
-            top_k (:obj:`int`, `optional`, defaults to 5):
+            top_k (`int`, *optional*, defaults to 5):
                 The number of top labels that will be returned by the pipeline. If the provided number is higher than
                 the number of labels available in the model configuration, it will default to the number of labels.
 
@@ -70,8 +69,8 @@ class ImageClassificationPipeline(Pipeline):
 
             The dictionaries contain the following keys:
 
-            - **label** (:obj:`str`) -- The label identified by the model.
-            - **score** (:obj:`int`) -- The score attributed by the model for that label.
+            - **label** (`str`) -- The label identified by the model.
+            - **score** (`float`) -- The score attributed by the model for that label.
         """
         return super().__call__(images, **kwargs)
 
diff --git a/src/transformers/pipelines/image_segmentation.py b/src/transformers/pipelines/image_segmentation.py
index fac8cddc67..4effb11290 100644
--- a/src/transformers/pipelines/image_segmentation.py
+++ b/src/transformers/pipelines/image_segmentation.py
@@ -29,14 +29,13 @@ Predictions = List[Prediction]
 @add_end_docstrings(PIPELINE_INIT_ARGS)
 class ImageSegmentationPipeline(Pipeline):
     """
-    Image segmentation pipeline using any :obj:`AutoModelForImageSegmentation`. This pipeline predicts masks of objects
+    Image segmentation pipeline using any `AutoModelForImageSegmentation`. This pipeline predicts masks of objects
     and their classes.
 
-    This image segmntation pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
-    task identifier: :obj:`"image-segmentation"`.
+    This image segmentation pipeline can currently be loaded from [`pipeline`] using the following
+    task identifier: `"image-segmentation"`.
 
-    See the list of available models on `huggingface.co/models
-    <https://huggingface.co/models?filter=image-segmentation>`__.
+    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=image-segmentation).
     """
 
     def __init__(self, *args, **kwargs):
@@ -61,7 +60,7 @@ class ImageSegmentationPipeline(Pipeline):
         Perform segmentation (detect masks & classes) in the image(s) passed as inputs.
 
         Args:
-            images (:obj:`str`, :obj:`List[str]`, :obj:`PIL.Image` or :obj:`List[PIL.Image]`):
+            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                 The pipeline handles three types of images:
 
                 - A string containing an HTTP(S) link pointing to an image
@@ -70,9 +69,9 @@ class ImageSegmentationPipeline(Pipeline):
 
                 The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
                 same format: all as HTTP(S) links, all as local paths, or all as PIL images.
-            threshold (:obj:`float`, `optional`, defaults to 0.9):
+            threshold (`float`, *optional*, defaults to 0.9):
                 The probability necessary to make a prediction.
-            mask_threshold (:obj:`float`, `optional`, defaults to 0.5):
+            mask_threshold (`float`, *optional*, defaults to 0.5):
                 Threshold to use when turning the predicted masks into binary values.
 
         Return:
@@ -82,9 +81,9 @@ class ImageSegmentationPipeline(Pipeline):
 
             The dictionaries contain the following keys:
 
-            - **label** (:obj:`str`) -- The class label identified by the model.
-            - **score** (:obj:`float`) -- The score attributed by the model for that label.
-            - **mask** (:obj:`str`) -- base64 string of a grayscale (single-channel) PNG image that contain masks
+            - **label** (`str`) -- The class label identified by the model.
+            - **score** (`float`) -- The score attributed by the model for that label.
+            - **mask** (`str`) -- base64 string of a grayscale (single-channel) PNG image that contain masks
               information. The PNG image has size (heigth, width) of the original image. Pixel values in the image are
               either 0 or 255 (i.e. mask is absent VS mask is present).
         """
@@ -130,7 +129,8 @@ class ImageSegmentationPipeline(Pipeline):
         Turns mask numpy array into mask base64 str.
 
         Args:
-            mask (np.array): Numpy array (with shape (heigth, width) of the original image) containing masks information. Values in the array are either 0 or 255 (i.e. mask is absent VS mask is present).
+            mask (`np.array`): Numpy array (with shape (height, width) of the original image) containing masks
+                information. Values in the array are either 0 or 255 (i.e. mask is absent VS mask is present).
 
         Returns:
             A base64 string of a single-channel PNG image that contain masks information.
diff --git a/src/transformers/pipelines/object_detection.py b/src/transformers/pipelines/object_detection.py
index 0d8df38575..4cfcf8435c 100644
--- a/src/transformers/pipelines/object_detection.py
+++ b/src/transformers/pipelines/object_detection.py
@@ -24,14 +24,13 @@ Predictions = List[Prediction]
 @add_end_docstrings(PIPELINE_INIT_ARGS)
 class ObjectDetectionPipeline(Pipeline):
     """
-    Object detection pipeline using any :obj:`AutoModelForObjectDetection`. This pipeline predicts bounding boxes of
+    Object detection pipeline using any `AutoModelForObjectDetection`. This pipeline predicts bounding boxes of
     objects and their classes.
 
-    This object detection pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
-    identifier: :obj:`"object-detection"`.
+    This object detection pipeline can currently be loaded from [`pipeline`] using the following task
+    identifier: `"object-detection"`.
 
-    See the list of available models on `huggingface.co/models
-    <https://huggingface.co/models?filter=object-detection>`__.
+    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=object-detection).
     """
 
     def __init__(self, *args, **kwargs):
@@ -54,7 +53,7 @@ class ObjectDetectionPipeline(Pipeline):
         Detect objects (bounding boxes & classes) in the image(s) passed as inputs.
 
         Args:
-            images (:obj:`str`, :obj:`List[str]`, :obj:`PIL.Image` or :obj:`List[PIL.Image]`):
+            images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                 The pipeline handles three types of images:
 
                 - A string containing an HTTP(S) link pointing to an image
@@ -63,7 +62,7 @@ class ObjectDetectionPipeline(Pipeline):
 
                 The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
                 same format: all as HTTP(S) links, all as local paths, or all as PIL images.
-            threshold (:obj:`float`, `optional`, defaults to 0.9):
+            threshold (`float`, *optional*, defaults to 0.9):
                 The probability necessary to make a prediction.
 
         Return:
@@ -73,9 +72,9 @@ class ObjectDetectionPipeline(Pipeline):
 
             The dictionaries contain the following keys:
 
-            - **label** (:obj:`str`) -- The class label identified by the model.
-            - **score** (:obj:`float`) -- The score attributed by the model for that label.
-            - **box** (:obj:`List[Dict[str, int]]`) -- The bounding box of detected object in image's original size.
+            - **label** (`str`) -- The class label identified by the model.
+            - **score** (`float`) -- The score attributed by the model for that label.
+            - **box** (`List[Dict[str, int]]`) -- The bounding box of detected object in image's original size.
         """
 
         return super().__call__(*args, **kwargs)
@@ -120,10 +119,10 @@ class ObjectDetectionPipeline(Pipeline):
         Turns list [xmin, xmax, ymin, ymax] into dict { "xmin": xmin, ... }
 
         Args:
-            box (torch.Tensor): Tensor containing the coordinates in corners format.
+            box (`torch.Tensor`): Tensor containing the coordinates in corners format.
 
         Returns:
-            bbox (Dict[str, int]): Dict containing the coordinates in corners format.
+            bbox (`Dict[str, int]`): Dict containing the coordinates in corners format.
         """
         if self.framework != "pt":
             raise ValueError("The ObjectDetectionPipeline is only available in PyTorch.")
diff --git a/src/transformers/pipelines/question_answering.py b/src/transformers/pipelines/question_answering.py
index 44cb512f80..3549eb7bf2 100644
--- a/src/transformers/pipelines/question_answering.py
+++ b/src/transformers/pipelines/question_answering.py
@@ -32,9 +32,9 @@ if is_torch_available():
 class QuestionAnsweringArgumentHandler(ArgumentHandler):
     """
     QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to
-    internal :class:`~transformers.SquadExample`.
+    internal [`SquadExample`].
 
-    QuestionAnsweringArgumentHandler manages all the possible to create a :class:`~transformers.SquadExample` from the
+    QuestionAnsweringArgumentHandler manages all the possible ways to create a [`SquadExample`] from the
     command-line supplied arguments.
     """
 
@@ -101,15 +101,13 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler):
 @add_end_docstrings(PIPELINE_INIT_ARGS)
 class QuestionAnsweringPipeline(Pipeline):
     """
-    Question Answering pipeline using any :obj:`ModelForQuestionAnswering`. See the `question answering examples
-    <../task_summary.html#question-answering>`__ for more information.
+    Question Answering pipeline using any `ModelForQuestionAnswering`. See the [question answering examples](../task_summary#question-answering) for more information.
 
-    This question answering pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
-    task identifier: :obj:`"question-answering"`.
+    This question answering pipeline can currently be loaded from [`pipeline`] using the following
+    task identifier: `"question-answering"`.
 
     The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the
-    up-to-date list of available models on `huggingface.co/models
-    <https://huggingface.co/models?filter=question-answering>`__.
+    up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=question-answering).
     """
 
     default_input_names = "question,context"
@@ -145,17 +143,17 @@ class QuestionAnsweringPipeline(Pipeline):
         question: Union[str, List[str]], context: Union[str, List[str]]
     ) -> Union[SquadExample, List[SquadExample]]:
         """
-        QuestionAnsweringPipeline leverages the :class:`~transformers.SquadExample` internally. This helper method
-        encapsulate all the logic for converting question(s) and context(s) to :class:`~transformers.SquadExample`.
+        QuestionAnsweringPipeline leverages the [`SquadExample`] internally. This helper method
+        encapsulates all the logic for converting question(s) and context(s) to [`SquadExample`].
 
         We currently support extractive question answering.
 
         Arguments:
-            question (:obj:`str` or :obj:`List[str]`): The question(s) asked.
-            context (:obj:`str` or :obj:`List[str]`): The context(s) in which we will look for the answer.
+            question (`str` or `List[str]`): The question(s) asked.
+            context (`str` or `List[str]`): The context(s) in which we will look for the answer.
 
         Returns:
-            One or a list of :class:`~transformers.SquadExample`: The corresponding :class:`~transformers.SquadExample`
+            One or a list of [`SquadExample`]: The corresponding [`SquadExample`]
             grouping question and context.
         """
         if isinstance(question, list):
@@ -206,43 +204,43 @@ class QuestionAnsweringPipeline(Pipeline):
         Answer the question(s) given as inputs by using the context(s).
 
         Args:
-            args (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`):
-                One or several :class:`~transformers.SquadExample` containing the question and context.
-            X (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`):
-                One or several :class:`~transformers.SquadExample` containing the question and context (will be treated
+            args ([`SquadExample`] or a list of [`SquadExample`]):
+                One or several [`SquadExample`] containing the question and context.
+            X ([`SquadExample`] or a list of [`SquadExample`], *optional*):
+                One or several [`SquadExample`] containing the question and context (will be treated
                 the same way as if passed as the first positional argument).
-            data (:class:`~transformers.SquadExample` or a list of :class:`~transformers.SquadExample`, `optional`):
-                One or several :class:`~transformers.SquadExample` containing the question and context (will be treated
+            data ([`SquadExample`] or a list of [`SquadExample`], *optional*):
+                One or several [`SquadExample`] containing the question and context (will be treated
                 the same way as if passed as the first positional argument).
-            question (:obj:`str` or :obj:`List[str]`):
-                One or several question(s) (must be used in conjunction with the :obj:`context` argument).
-            context (:obj:`str` or :obj:`List[str]`):
+            question (`str` or `List[str]`):
+                One or several question(s) (must be used in conjunction with the `context` argument).
+            context (`str` or `List[str]`):
                 One or several context(s) associated with the question(s) (must be used in conjunction with the
-                :obj:`question` argument).
-            topk (:obj:`int`, `optional`, defaults to 1):
+                `question` argument).
+            topk (`int`, *optional*, defaults to 1):
                 The number of answers to return (will be chosen by order of likelihood). Note that we return less than
                 topk answers if there are not enough options available within the context.
-            doc_stride (:obj:`int`, `optional`, defaults to 128):
+            doc_stride (`int`, *optional*, defaults to 128):
                 If the context is too long to fit with the question for the model, it will be split in several chunks
                 with some overlap. This argument controls the size of that overlap.
-            max_answer_len (:obj:`int`, `optional`, defaults to 15):
+            max_answer_len (`int`, *optional*, defaults to 15):
                 The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
-            max_seq_len (:obj:`int`, `optional`, defaults to 384):
+            max_seq_len (`int`, *optional*, defaults to 384):
                 The maximum length of the total sentence (context + question) after tokenization. The context will be
-                split in several chunks (using :obj:`doc_stride`) if needed.
-            max_question_len (:obj:`int`, `optional`, defaults to 64):
+                split in several chunks (using `doc_stride`) if needed.
+            max_question_len (`int`, *optional*, defaults to 64):
                 The maximum length of the question after tokenization. It will be truncated if needed.
-            handle_impossible_answer (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            handle_impossible_answer (`bool`, *optional*, defaults to `False`):
                 Whether or not we accept impossible as an answer.
 
         Return:
-            A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the following keys:
+            A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys:
 
-            - **score** (:obj:`float`) -- The probability associated to the answer.
-            - **start** (:obj:`int`) -- The character start index of the answer (in the tokenized version of the
+            - **score** (`float`) -- The probability associated to the answer.
+            - **start** (`int`) -- The character start index of the answer (in the tokenized version of the
               input).
-            - **end** (:obj:`int`) -- The character end index of the answer (in the tokenized version of the input).
-            - **answer** (:obj:`str`) -- The answer to the question.
+            - **end** (`int`) -- The character end index of the answer (in the tokenized version of the input).
+            - **answer** (`str`) -- The answer to the question.
         """
         if kwargs.get("batch_size", 1) > 1:
             logger.error("Batch_size > 1 is not supported for question answering pipeline, setting it to 1.")
@@ -480,7 +478,7 @@ class QuestionAnsweringPipeline(Pipeline):
         self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int, undesired_tokens: np.ndarray
     ) -> Tuple:
         """
-        Take the output of any :obj:`ModelForQuestionAnswering` and will generate probabilities for each span to be the
+        Take the output of any `ModelForQuestionAnswering` and generate probabilities for each span to be the
         actual answer.
 
         In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or
@@ -488,11 +486,11 @@ class QuestionAnsweringPipeline(Pipeline):
         the topk argument.
 
         Args:
-            start (:obj:`np.ndarray`): Individual start probabilities for each token.
-            end (:obj:`np.ndarray`): Individual end probabilities for each token.
-            topk (:obj:`int`): Indicates how many possible answer span(s) to extract from the model output.
-            max_answer_len (:obj:`int`): Maximum size of the answer to extract from the model's output.
-            undesired_tokens (:obj:`np.ndarray`): Mask determining tokens that can be part of the answer
+            start (`np.ndarray`): Individual start probabilities for each token.
+            end (`np.ndarray`): Individual end probabilities for each token.
+            topk (`int`): Indicates how many possible answer span(s) to extract from the model output.
+            max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
+            undesired_tokens (`np.ndarray`): Mask determining tokens that can be part of the answer
         """
         # Ensure we have batch axis
         if start.ndim == 1:
@@ -530,12 +528,12 @@ class QuestionAnsweringPipeline(Pipeline):
         When decoding from token probabilities, this method maps token indexes to actual word in the initial context.
 
         Args:
-            text (:obj:`str`): The actual context to extract the answer from.
-            start (:obj:`int`): The answer starting token index.
-            end (:obj:`int`): The answer end token index.
+            text (`str`): The actual context to extract the answer from.
+            start (`int`): The answer starting token index.
+            end (`int`): The answer end token index.
 
         Returns:
-            Dictionary like :obj:`{'answer': str, 'start': int, 'end': int}`
+            Dictionary like `{'answer': str, 'start': int, 'end': int}`
         """
         words = []
         token_idx = char_start_idx = char_end_idx = chars_idx = 0
diff --git a/src/transformers/pipelines/table_question_answering.py b/src/transformers/pipelines/table_question_answering.py
index 3634fa8c69..fcce75b827 100644
--- a/src/transformers/pipelines/table_question_answering.py
+++ b/src/transformers/pipelines/table_question_answering.py
@@ -82,15 +82,14 @@ class TableQuestionAnsweringArgumentHandler(ArgumentHandler):
 @add_end_docstrings(PIPELINE_INIT_ARGS)
 class TableQuestionAnsweringPipeline(Pipeline):
     """
-    Table Question Answering pipeline using a :obj:`ModelForTableQuestionAnswering`. This pipeline is only available in
+    Table Question Answering pipeline using a `ModelForTableQuestionAnswering`. This pipeline is only available in
     PyTorch.
 
-    This tabular question answering pipeline can currently be loaded from :func:`~transformers.pipeline` using the
-    following task identifier: :obj:`"table-question-answering"`.
+    This tabular question answering pipeline can currently be loaded from [`pipeline`] using the
+    following task identifier: `"table-question-answering"`.
 
     The models that this pipeline can use are models that have been fine-tuned on a tabular question answering task.
-    See the up-to-date list of available models on `huggingface.co/models
-    <https://huggingface.co/models?filter=table-question-answering>`__.
+    See the up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=table-question-answering).
     """
 
     default_input_names = "table,query"
@@ -245,60 +244,63 @@ class TableQuestionAnsweringPipeline(Pipeline):
         r"""
         Answers queries according to a table. The pipeline accepts several types of inputs which are detailed below:
 
-        - ``pipeline(table, query)``
-        - ``pipeline(table, [query])``
-        - ``pipeline(table=table, query=query)``
-        - ``pipeline(table=table, query=[query])``
-        - ``pipeline({"table": table, "query": query})``
-        - ``pipeline({"table": table, "query": [query]})``
-        - ``pipeline([{"table": table, "query": query}, {"table": table, "query": query}])``
+        - `pipeline(table, query)`
+        - `pipeline(table, [query])`
+        - `pipeline(table=table, query=query)`
+        - `pipeline(table=table, query=[query])`
+        - `pipeline({"table": table, "query": query})`
+        - `pipeline({"table": table, "query": [query]})`
+        - `pipeline([{"table": table, "query": query}, {"table": table, "query": query}])`
 
-        The :obj:`table` argument should be a dict or a DataFrame built from that dict, containing the whole table:
+        The `table` argument should be a dict or a DataFrame built from that dict, containing the whole table:
 
-        Example::
+        Example:
 
-            data = {
-                "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
-                "age": ["56", "45", "59"],
-                "number of movies": ["87", "53", "69"],
-                "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
-            }
+        ```python
+        data = {
+            "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
+            "age": ["56", "45", "59"],
+            "number of movies": ["87", "53", "69"],
+            "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
+        }
+        ```
 
         This dictionary can be passed in as such, or can be converted to a pandas DataFrame:
 
-        Example::
-
-            import pandas as pd
-            table = pd.DataFrame.from_dict(data)
+        Example:
 
+        ```python
+        import pandas as pd
+        table = pd.DataFrame.from_dict(data)
+        ```
 
         Args:
-            table (:obj:`pd.DataFrame` or :obj:`Dict`):
+            table (`pd.DataFrame` or `Dict`):
                 Pandas DataFrame or dictionary that will be converted to a DataFrame containing all the table values.
                 See above for an example of dictionary.
-            query (:obj:`str` or :obj:`List[str]`):
+            query (`str` or `List[str]`):
                 Query or list of queries that will be sent to the model alongside the table.
-            sequential (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            sequential (`bool`, *optional*, defaults to `False`):
                 Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the
                 inference to be done sequentially to extract relations within sequences, given their conversational
                 nature.
-            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
                 Activates and controls padding. Accepts the following values:
 
-                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                   single sequence if provided).
-                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                   maximum acceptable input length for the model if that argument is not provided.
-                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                   different lengths).
 
-            truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.TapasTruncationStrategy`, `optional`, defaults to :obj:`False`):
+            truncation (`bool`, `str` or [`TapasTruncationStrategy`], *optional*, defaults to `False`):
                 Activates and controls truncation. Accepts the following values:
 
-                * :obj:`True` or :obj:`'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument
-                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
+                - `True` or `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument
+                  `max_length` or to the maximum acceptable input length for the model if that argument is not
                   provided. This will truncate row by row, removing rows from the table.
-                * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with
                   sequence lengths greater than the model maximum admissible input size).
 
 
@@ -306,11 +308,11 @@ class TableQuestionAnsweringPipeline(Pipeline):
             A dictionary or a list of dictionaries containing results: Each result is a dictionary with the following
             keys:
 
-            - **answer** (:obj:`str`) -- The answer of the query given the table. If there is an aggregator, the answer
-              will be preceded by :obj:`AGGREGATOR >`.
-            - **coordinates** (:obj:`List[Tuple[int, int]]`) -- Coordinates of the cells of the answers.
-            - **cells** (:obj:`List[str]`) -- List of strings made up of the answer cell values.
-            - **aggregator** (:obj:`str`) -- If the model has an aggregator, this returns the aggregator.
+            - **answer** (`str`) -- The answer of the query given the table. If there is an aggregator, the answer
+              will be preceded by `AGGREGATOR >`.
+            - **coordinates** (`List[Tuple[int, int]]`) -- Coordinates of the cells of the answers.
+            - **cells** (`List[str]`) -- List of strings made up of the answer cell values.
+            - **aggregator** (`str`) -- If the model has an aggregator, this returns the aggregator.
         """
         pipeline_inputs = self._args_parser(*args, **kwargs)
 
diff --git a/src/transformers/pipelines/text2text_generation.py b/src/transformers/pipelines/text2text_generation.py
index 050d025783..69bff60a80 100644
--- a/src/transformers/pipelines/text2text_generation.py
+++ b/src/transformers/pipelines/text2text_generation.py
@@ -27,18 +27,18 @@ class Text2TextGenerationPipeline(Pipeline):
     """
     Pipeline for text to text generation using seq2seq models.
 
-    This Text2TextGenerationPipeline pipeline can currently be loaded from :func:`~transformers.pipeline` using the
-    following task identifier: :obj:`"text2text-generation"`.
+    This Text2TextGenerationPipeline pipeline can currently be loaded from [`pipeline`] using the
+    following task identifier: `"text2text-generation"`.
 
     The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
-    up-to-date list of available models on `huggingface.co/models
-    <https://huggingface.co/models?filter=text2text-generation>`__.
+    up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text2text-generation).
 
-    Usage::
+    Usage:
 
-        text2text_generator = pipeline("text2text-generation")
-        text2text_generator("question: What is 42 ? context: 42 is the answer to life, the universe and everything")
-    """
+    ```python
+    text2text_generator = pipeline("text2text-generation")
+    text2text_generator("question: What is 42 ? context: 42 is the answer to life, the universe and everything")
+    ```"""
 
     # Used in the return key of the pipeline.
     return_name = "generated"
@@ -110,27 +110,27 @@ class Text2TextGenerationPipeline(Pipeline):
         Generate the output text(s) using text(s) given as inputs.
 
         Args:
-            args (:obj:`str` or :obj:`List[str]`):
+            args (`str` or `List[str]`):
                 Input text for the encoder.
-            return_tensors (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            return_tensors (`bool`, *optional*, defaults to `False`):
                 Whether or not to include the tensors of predictions (as token indices) in the outputs.
-            return_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            return_text (`bool`, *optional*, defaults to `True`):
                 Whether or not to include the decoded texts in the outputs.
-            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                 Whether or not to clean up the potential extra spaces in the text output.
-            truncation (:obj:`TruncationStrategy`, `optional`, defaults to :obj:`TruncationStrategy.DO_NOT_TRUNCATE`):
+            truncation (`TruncationStrategy`, *optional*, defaults to `TruncationStrategy.DO_NOT_TRUNCATE`):
                 The truncation strategy for the tokenization within the pipeline.
-                :obj:`TruncationStrategy.DO_NOT_TRUNCATE` (default) will never truncate, but it is sometimes desirable
+                `TruncationStrategy.DO_NOT_TRUNCATE` (default) will never truncate, but it is sometimes desirable
                 to truncate the input to fit the model's max_length instead of throwing an error down the line.
             generate_kwargs:
                 Additional keyword arguments to pass along to the generate method of the model (see the generate method
-                corresponding to your framework `here <./model.html#generative-models>`__).
+                corresponding to your framework [here](./model#generative-models)).
 
         Return:
-            A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
+            A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:
 
-            - **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text.
-            - **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
+            - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
+            - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`)
               -- The token ids of the generated text.
         """
 
@@ -175,23 +175,24 @@ class SummarizationPipeline(Text2TextGenerationPipeline):
     """
     Summarize news articles and other documents.
 
-    This summarizing pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
-    identifier: :obj:`"summarization"`.
+    This summarizing pipeline can currently be loaded from [`pipeline`] using the following task
+    identifier: `"summarization"`.
 
     The models that this pipeline can use are models that have been fine-tuned on a summarization task, which is
-    currently, '`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'. See the up-to-date
-    list of available models on `huggingface.co/models <https://huggingface.co/models?filter=summarization>`__.
+    currently, '*bart-large-cnn*', '*t5-small*', '*t5-base*', '*t5-large*', '*t5-3b*', '*t5-11b*'. See the up-to-date
+    list of available models on [huggingface.co/models](https://huggingface.co/models?filter=summarization).
 
-    Usage::
+    Usage:
 
-        # use bart in pytorch
-        summarizer = pipeline("summarization")
-        summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)
+    ```python
+    # use bart in pytorch
+    summarizer = pipeline("summarization")
+    summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)
 
-        # use t5 in tf
-        summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")
-        summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)
-    """
+    # use t5 in tf
+    summarizer = pipeline("summarization", model="t5-base", tokenizer="t5-base", framework="tf")
+    summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)
+    ```"""
 
     # Used in the return key of the pipeline.
     return_name = "summary"
@@ -201,24 +202,24 @@ class SummarizationPipeline(Text2TextGenerationPipeline):
         Summarize the text(s) given as inputs.
 
         Args:
-            documents (`str` or :obj:`List[str]`):
+            documents (`str` or `List[str]`):
                 One or several articles (or one list of articles) to summarize.
-            return_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            return_text (`bool`, *optional*, defaults to `True`):
                 Whether or not to include the decoded texts in the outputs
-            return_tensors (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            return_tensors (`bool`, *optional*, defaults to `False`):
                 Whether or not to include the tensors of predictions (as token indices) in the outputs.
-            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                 Whether or not to clean up the potential extra spaces in the text output.
             generate_kwargs:
                 Additional keyword arguments to pass along to the generate method of the model (see the generate method
-                corresponding to your framework `here <./model.html#generative-models>`__).
+                corresponding to your framework [here](./model#generative-models)).
 
         Return:
-            A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
+            A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:
 
-            - **summary_text** (:obj:`str`, present when ``return_text=True``) -- The summary of the corresponding
+            - **summary_text** (`str`, present when `return_text=True`) -- The summary of the corresponding
               input.
-            - **summary_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``) --
+            - **summary_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) --
               The token ids of the summary.
         """
         return super().__call__(*args, **kwargs)
@@ -242,17 +243,18 @@ class TranslationPipeline(Text2TextGenerationPipeline):
     """
     Translates from one language to another.
 
-    This translation pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task
-    identifier: :obj:`"translation_xx_to_yy"`.
+    This translation pipeline can currently be loaded from [`pipeline`] using the following task
+    identifier: `"translation_xx_to_yy"`.
 
     The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
-    up-to-date list of available models on `huggingface.co/models
-    <https://huggingface.co/models?filter=translation>`__.
+    up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=translation).
 
-    Usage::
-        en_fr_translator = pipeline("translation_en_to_fr")
-        en_fr_translator("How old are you?")
-    """
+    Usage:
+
+    ```python
+    en_fr_translator = pipeline("translation_en_to_fr")
+    en_fr_translator("How old are you?")
+    ```"""
 
     # Used in the return key of the pipeline.
     return_name = "translation"
@@ -294,29 +296,29 @@ class TranslationPipeline(Text2TextGenerationPipeline):
         Translate the text(s) given as inputs.
 
         Args:
-            args (:obj:`str` or :obj:`List[str]`):
+            args (`str` or `List[str]`):
                 Texts to be translated.
-            return_tensors (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            return_tensors (`bool`, *optional*, defaults to `False`):
                 Whether or not to include the tensors of predictions (as token indices) in the outputs.
-            return_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            return_text (`bool`, *optional*, defaults to `True`):
                 Whether or not to include the decoded texts in the outputs.
-            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                 Whether or not to clean up the potential extra spaces in the text output.
-            src_lang (:obj:`str`, `optional`):
+            src_lang (`str`, *optional*):
                 The language of the input. Might be required for multilingual models. Will not have any effect for
                 single pair translation models
-            tgt_lang (:obj:`str`, `optional`):
+            tgt_lang (`str`, *optional*):
                 The language of the desired output. Might be required for multilingual models. Will not have any effect
                 for single pair translation models
             generate_kwargs:
                 Additional keyword arguments to pass along to the generate method of the model (see the generate method
-                corresponding to your framework `here <./model.html#generative-models>`__).
+                corresponding to your framework [here](./model#generative-models)).
 
         Return:
-            A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
+            A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:
 
-            - **translation_text** (:obj:`str`, present when ``return_text=True``) -- The translation.
-            - **translation_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
+            - **translation_text** (`str`, present when `return_text=True`) -- The translation.
+            - **translation_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`)
               -- The token ids of the translation.
         """
         return super().__call__(*args, **kwargs)
diff --git a/src/transformers/pipelines/text_classification.py b/src/transformers/pipelines/text_classification.py
index 4c331d9170..e3af7265a7 100644
--- a/src/transformers/pipelines/text_classification.py
+++ b/src/transformers/pipelines/text_classification.py
@@ -32,33 +32,32 @@ class ClassificationFunction(ExplicitEnum):
 @add_end_docstrings(
     PIPELINE_INIT_ARGS,
     r"""
-        return_all_scores (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        return_all_scores (`bool`, *optional*, defaults to `False`):
             Whether to return all prediction scores or just the one of the predicted class.
-        function_to_apply (:obj:`str`, `optional`, defaults to :obj:`"default"`):
+        function_to_apply (`str`, *optional*, defaults to `"default"`):
             The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:
 
-            - :obj:`"default"`: if the model has a single label, will apply the sigmoid function on the output. If the
+            - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the
               model has several labels, will apply the softmax function on the output.
-            - :obj:`"sigmoid"`: Applies the sigmoid function on the output.
-            - :obj:`"softmax"`: Applies the softmax function on the output.
-            - :obj:`"none"`: Does not apply any function on the output.
+            - `"sigmoid"`: Applies the sigmoid function on the output.
+            - `"softmax"`: Applies the softmax function on the output.
+            - `"none"`: Does not apply any function on the output.
     """,
 )
 class TextClassificationPipeline(Pipeline):
     """
-    Text classification pipeline using any :obj:`ModelForSequenceClassification`. See the `sequence classification
-    examples <../task_summary.html#sequence-classification>`__ for more information.
+    Text classification pipeline using any `ModelForSequenceClassification`. See the [sequence classification
+    examples](../task_summary#sequence-classification) for more information.
 
-    This text classification pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
-    task identifier: :obj:`"sentiment-analysis"` (for classifying sequences according to positive or negative
+    This text classification pipeline can currently be loaded from [`pipeline`] using the following
+    task identifier: `"sentiment-analysis"` (for classifying sequences according to positive or negative
     sentiments).
 
-    If multiple classification labels are available (:obj:`model.config.num_labels >= 2`), the pipeline will run a
+    If multiple classification labels are available (`model.config.num_labels >= 2`), the pipeline will run a
     softmax over the results. If there is a single label, the pipeline will run a sigmoid over the result.
 
     The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See
-    the up-to-date list of available models on `huggingface.co/models
-    <https://huggingface.co/models?filter=text-classification>`__.
+    the up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-classification).
     """
 
     return_all_scores = False
@@ -95,11 +94,11 @@ class TextClassificationPipeline(Pipeline):
         Classify the text(s) given as inputs.
 
         Args:
-            args (:obj:`str` or :obj:`List[str]`):
+            args (`str` or `List[str]`):
                 One or several texts (or one list of prompts) to classify.
-            return_all_scores (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            return_all_scores (`bool`, *optional*, defaults to `False`):
                 Whether to return scores for all labels.
-            function_to_apply (:obj:`str`, `optional`, defaults to :obj:`"default"`):
+            function_to_apply (`str`, *optional*, defaults to `"default"`):
                 The function to apply to the model outputs in order to retrieve the scores. Accepts four different
                 values:
 
@@ -111,17 +110,17 @@ class TextClassificationPipeline(Pipeline):
 
                 Possible values are:
 
-                - :obj:`"sigmoid"`: Applies the sigmoid function on the output.
-                - :obj:`"softmax"`: Applies the softmax function on the output.
-                - :obj:`"none"`: Does not apply any function on the output.
+                - `"sigmoid"`: Applies the sigmoid function on the output.
+                - `"softmax"`: Applies the softmax function on the output.
+                - `"none"`: Does not apply any function on the output.
 
         Return:
-            A list or a list of list of :obj:`dict`: Each result comes as list of dictionaries with the following keys:
+            A list or a list of list of `dict`: Each result comes as a list of dictionaries with the following keys:
 
-            - **label** (:obj:`str`) -- The label predicted.
-            - **score** (:obj:`float`) -- The corresponding probability.
+            - **label** (`str`) -- The label predicted.
+            - **score** (`float`) -- The corresponding probability.
 
-            If ``self.return_all_scores=True``, one such dictionary is returned per label.
+            If `self.return_all_scores=True`, one such dictionary is returned per label.
         """
         result = super().__call__(*args, **kwargs)
         if isinstance(args[0], str):
diff --git a/src/transformers/pipelines/text_generation.py b/src/transformers/pipelines/text_generation.py
index aed44bab46..9ae77c3486 100644
--- a/src/transformers/pipelines/text_generation.py
+++ b/src/transformers/pipelines/text_generation.py
@@ -15,15 +15,15 @@ class ReturnType(enum.Enum):
 @add_end_docstrings(PIPELINE_INIT_ARGS)
 class TextGenerationPipeline(Pipeline):
     """
-    Language generation pipeline using any :obj:`ModelWithLMHead`. This pipeline predicts the words that will follow a
+    Language generation pipeline using any `ModelWithLMHead`. This pipeline predicts the words that will follow a
     specified text prompt.
 
-    This language generation pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
-    task identifier: :obj:`"text-generation"`.
+    This language generation pipeline can currently be loaded from [`pipeline`] using the following
+    task identifier: `"text-generation"`.
 
     The models that this pipeline can use are models that have been trained with an autoregressive language modeling
     objective, which includes the uni-directional models in the library (e.g. gpt2). See the list of available models
-    on `huggingface.co/models <https://huggingface.co/models?filter=text-generation>`__.
+    on [huggingface.co/models](https://huggingface.co/models?filter=text-generation).
     """
 
     # Prefix text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
@@ -134,38 +134,38 @@ class TextGenerationPipeline(Pipeline):
         Complete the prompt(s) given as inputs.
 
         Args:
-            args (:obj:`str` or :obj:`List[str]`):
+            args (`str` or `List[str]`):
                 One or several prompts (or one list of prompts) to complete.
-            return_tensors (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            return_tensors (`bool`, *optional*, defaults to `False`):
                 Whether or not to include the tensors of predictions (as token indices) in the outputs.
-            return_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            return_text (`bool`, *optional*, defaults to `True`):
                 Whether or not to include the decoded texts in the outputs.
-            return_full_text (:obj:`bool`, `optional`, defaults to :obj:`True`):
-                If set to :obj:`False` only added text is returned, otherwise the full text is returned Only meaningful
-                if `return_text` is set to True.
-            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            return_full_text (`bool`, *optional*, defaults to `True`):
+                If set to `False` only added text is returned, otherwise the full text is returned. Only meaningful
+                if `return_text` is set to True.
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                 Whether or not to clean up the potential extra spaces in the text output.
-            prefix (:obj:`str`, `optional`):
+            prefix (`str`, *optional*):
                 Prefix added to prompt.
-            handle_long_generation (:obj:`str`, `optional`):
+            handle_long_generation (`str`, *optional*):
                 By default, this pipelines does not handle long generation (ones that exceed in one form or the other
                 the model maximum length). There is no perfect way to adress this (more info
                 :https://github.com/huggingface/transformers/issues/14033#issuecomment-948385227). This provides common
                 strategies to work around that problem depending on your use case.
 
-                - :obj:`None` : default strategy where nothing in particular happens
-                - :obj:`"hole"`: Truncates left of input, and leaves a gap wide enough to let generation happen (might
+                - `None` : default strategy where nothing in particular happens
+                - `"hole"`: Truncates left of input, and leaves a gap wide enough to let generation happen (might
                   truncate a lot of the prompt and not suitable when generation exceed the model capacity)
 
             generate_kwargs:
                 Additional keyword arguments to pass along to the generate method of the model (see the generate method
-                corresponding to your framework `here <./model.html#generative-models>`__).
+                corresponding to your framework [here](./model#generative-models)).
 
         Return:
-            A list or a list of list of :obj:`dict`: Each result comes as a dictionary with the following keys:
+            A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:
 
-            - **generated_text** (:obj:`str`, present when ``return_text=True``) -- The generated text.
-            - **generated_token_ids** (:obj:`torch.Tensor` or :obj:`tf.Tensor`, present when ``return_tensors=True``)
+            - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
+            - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`)
               -- The token ids of the generated text.
         """
         return super().__call__(text_inputs, **kwargs)
diff --git a/src/transformers/pipelines/token_classification.py b/src/transformers/pipelines/token_classification.py
index 9bfea4c378..13d7014693 100644
--- a/src/transformers/pipelines/token_classification.py
+++ b/src/transformers/pipelines/token_classification.py
@@ -56,12 +56,13 @@ class AggregationStrategy(ExplicitEnum):
 @add_end_docstrings(
     PIPELINE_INIT_ARGS,
     r"""
-        ignore_labels (:obj:`List[str]`, defaults to :obj:`["O"]`):
+        ignore_labels (`List[str]`, defaults to `["O"]`):
             A list of labels to ignore.
-        grouped_entities (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            DEPRECATED, use :obj:`aggregation_strategy` instead. Whether or not to group the tokens corresponding to
+        grouped_entities (`bool`, *optional*, defaults to `False`):
+            DEPRECATED, use `aggregation_strategy` instead. Whether or not to group the tokens corresponding to
             the same entity together in the predictions or not.
-        aggregation_strategy (:obj:`str`, `optional`, defaults to :obj:`"none"`): The strategy to fuse (or not) tokens based on the model prediction.
+        aggregation_strategy (`str`, *optional*, defaults to `"none"`):
+            The strategy to fuse (or not) tokens based on the model prediction.
 
                 - "none" : Will simply not do any aggregation and simply return raw results from the model
                 - "simple" : Will attempt to group entities following the default schema. (A, B-TAG), (B, I-TAG), (C,
@@ -72,28 +73,27 @@ class AggregationStrategy(ExplicitEnum):
                   "NAME"}]. Look for FIRST, MAX, AVERAGE for ways to mitigate that and disambiguate words (on languages
                   that support that meaning, which is basically tokens separated by a space). These mitigations will
                   only work on real words, "New york" might still be tagged with two different entities.
-                - "first" : (works only on word based models) Will use the :obj:`SIMPLE` strategy except that words,
+                - "first" : (works only on word based models) Will use the `SIMPLE` strategy except that words,
                   cannot end up with different tags. Words will simply use the tag of the first token of the word when
                   there is ambiguity.
-                - "average" : (works only on word based models) Will use the :obj:`SIMPLE` strategy except that words,
+                - "average" : (works only on word based models) Will use the `SIMPLE` strategy except that words,
                   cannot end up with different tags. scores will be averaged first across tokens, and then the maximum
                   label is applied.
-                - "max" : (works only on word based models) Will use the :obj:`SIMPLE` strategy except that words,
+                - "max" : (works only on word based models) Will use the `SIMPLE` strategy except that words,
                   cannot end up with different tags. Word entity will simply be the token with the maximum score.
     """,
 )
 class TokenClassificationPipeline(Pipeline):
     """
-    Named Entity Recognition pipeline using any :obj:`ModelForTokenClassification`. See the `named entity recognition
-    examples <../task_summary.html#named-entity-recognition>`__ for more information.
+    Named Entity Recognition pipeline using any `ModelForTokenClassification`. See the [named entity recognition
+    examples](../task_summary#named-entity-recognition) for more information.
 
-    This token recognition pipeline can currently be loaded from :func:`~transformers.pipeline` using the following
-    task identifier: :obj:`"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location
+    This token recognition pipeline can currently be loaded from [`pipeline`] using the following
+    task identifier: `"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location
     or miscellaneous).
 
     The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the
-    up-to-date list of available models on `huggingface.co/models
-    <https://huggingface.co/models?filter=token-classification>`__.
+    up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=token-classification).
     """
 
     default_input_names = "sequences"
@@ -162,23 +162,23 @@ class TokenClassificationPipeline(Pipeline):
         Classify each token of the text(s) given as inputs.
 
         Args:
-            inputs (:obj:`str` or :obj:`List[str]`):
+            inputs (`str` or `List[str]`):
                 One or several texts (or one list of texts) for token classification.
 
         Return:
-            A list or a list of list of :obj:`dict`: Each result comes as a list of dictionaries (one for each token in
+            A list or a list of list of `dict`: Each result comes as a list of dictionaries (one for each token in
             the corresponding input, or each entity if this pipeline was instantiated with an aggregation_strategy)
             with the following keys:
 
-            - **word** (:obj:`str`) -- The token/word classified.
-            - **score** (:obj:`float`) -- The corresponding probability for :obj:`entity`.
-            - **entity** (:obj:`str`) -- The entity predicted for that token/word (it is named `entity_group` when
-              `aggregation_strategy` is not :obj:`"none"`.
-            - **index** (:obj:`int`, only present when ``aggregation_strategy="none"``) -- The index of the
+            - **word** (`str`) -- The token/word classified.
+            - **score** (`float`) -- The corresponding probability for `entity`.
+            - **entity** (`str`) -- The entity predicted for that token/word (it is named `entity_group` when
+              `aggregation_strategy` is not `"none"`).
+            - **index** (`int`, only present when `aggregation_strategy="none"`) -- The index of the
               corresponding token in the sentence.
-            - **start** (:obj:`int`, `optional`) -- The index of the start of the corresponding entity in the sentence.
+            - **start** (`int`, *optional*) -- The index of the start of the corresponding entity in the sentence.
               Only exists if the offsets are available within the tokenizer
-            - **end** (:obj:`int`, `optional`) -- The index of the end of the corresponding entity in the sentence.
+            - **end** (`int`, *optional*) -- The index of the end of the corresponding entity in the sentence.
               Only exists if the offsets are available within the tokenizer
         """
 
@@ -395,7 +395,7 @@ class TokenClassificationPipeline(Pipeline):
         Group together the adjacent tokens with the same entity predicted.
 
         Args:
-            entities (:obj:`dict`): The entities predicted by the pipeline.
+            entities (`dict`): The entities predicted by the pipeline.
         """
         # Get the first entity in the entity group
         entity = entities[0]["entity"].split("-")[-1]
@@ -430,7 +430,7 @@ class TokenClassificationPipeline(Pipeline):
         Find and group together the adjacent tokens with the same entity predicted.
 
         Args:
-            entities (:obj:`dict`): The entities predicted by the pipeline.
+            entities (`dict`): The entities predicted by the pipeline.
         """
 
         entity_groups = []
diff --git a/src/transformers/pipelines/zero_shot_classification.py b/src/transformers/pipelines/zero_shot_classification.py
index be188cfb82..e5ffb9b1c7 100644
--- a/src/transformers/pipelines/zero_shot_classification.py
+++ b/src/transformers/pipelines/zero_shot_classification.py
@@ -46,19 +46,19 @@ class ZeroShotClassificationArgumentHandler(ArgumentHandler):
 @add_end_docstrings(PIPELINE_INIT_ARGS)
 class ZeroShotClassificationPipeline(Pipeline):
     """
-    NLI-based zero-shot classification pipeline using a :obj:`ModelForSequenceClassification` trained on NLI (natural
+    NLI-based zero-shot classification pipeline using a `ModelForSequenceClassification` trained on NLI (natural
     language inference) tasks.
 
     Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
-    pair and passed to the pretrained model. Then, the logit for `entailment` is taken as the logit for the candidate
-    label being valid. Any NLI model can be used, but the id of the `entailment` label must be included in the model
-    config's :attr:`~transformers.PretrainedConfig.label2id`.
+    pair and passed to the pretrained model. Then, the logit for *entailment* is taken as the logit for the candidate
+    label being valid. Any NLI model can be used, but the id of the *entailment* label must be included in the model
+    config's [`~PretrainedConfig.label2id`].
 
-    This NLI pipeline can currently be loaded from :func:`~transformers.pipeline` using the following task identifier:
-    :obj:`"zero-shot-classification"`.
+    This NLI pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+    `"zero-shot-classification"`.
 
     The models that this pipeline can use are models that have been fine-tuned on an NLI task. See the up-to-date list
-    of available models on `huggingface.co/models <https://huggingface.co/models?search=nli>`__.
+    of available models on [huggingface.co/models](https://huggingface.co/models?search=nli).
     """
 
     def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), *args, **kwargs):
@@ -154,34 +154,34 @@ class ZeroShotClassificationPipeline(Pipeline):
         **kwargs,
     ):
         """
-        Classify the sequence(s) given as inputs. See the :class:`~transformers.ZeroShotClassificationPipeline`
+        Classify the sequence(s) given as inputs. See the [`ZeroShotClassificationPipeline`]
         documentation for more information.
 
         Args:
-            sequences (:obj:`str` or :obj:`List[str]`):
+            sequences (`str` or `List[str]`):
                 The sequence(s) to classify, will be truncated if the model input is too large.
-            candidate_labels (:obj:`str` or :obj:`List[str]`):
+            candidate_labels (`str` or `List[str]`):
                 The set of possible class labels to classify each sequence into. Can be a single label, a string of
                 comma-separated labels, or a list of labels.
-            hypothesis_template (:obj:`str`, `optional`, defaults to :obj:`"This example is {}."`):
+            hypothesis_template (`str`, *optional*, defaults to `"This example is {}."`):
                 The template used to turn each label into an NLI-style hypothesis. This template must include a {} or
                 similar syntax for the candidate label to be inserted into the template. For example, the default
-                template is :obj:`"This example is {}."` With the candidate label :obj:`"sports"`, this would be fed
-                into the model like :obj:`"<cls> sequence to classify <sep> This example is sports . <sep>"`. The
+                template is `"This example is {}."` With the candidate label `"sports"`, this would be fed
+                into the model like `"<cls> sequence to classify <sep> This example is sports . <sep>"`. The
                 default template works well in many cases, but it may be worthwhile to experiment with different
                 templates depending on the task setting.
-            multi_label (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not multiple candidate labels can be true. If :obj:`False`, the scores are normalized such
-                that the sum of the label likelihoods for each sequence is 1. If :obj:`True`, the labels are considered
+            multi_label (`bool`, *optional*, defaults to `False`):
+                Whether or not multiple candidate labels can be true. If `False`, the scores are normalized such
+                that the sum of the label likelihoods for each sequence is 1. If `True`, the labels are considered
                 independent and probabilities are normalized for each candidate by doing a softmax of the entailment
                 score vs. the contradiction score.
 
         Return:
-            A :obj:`dict` or a list of :obj:`dict`: Each result comes as a dictionary with the following keys:
+            A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys:
 
-            - **sequence** (:obj:`str`) -- The sequence for which this is the output.
-            - **labels** (:obj:`List[str]`) -- The labels sorted by order of likelihood.
-            - **scores** (:obj:`List[float]`) -- The probabilities for each of the labels.
+            - **sequence** (`str`) -- The sequence for which this is the output.
+            - **labels** (`List[str]`) -- The labels sorted by order of likelihood.
+            - **scores** (`List[float]`) -- The probabilities for each of the labels.
         """
         if kwargs.get("batch_size", 1) > 1:
             logger.error("Batch size > 1 is not supported for zero-shot pipeline, setting batch_size=1.")
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 37ef221472..d56c516ad0 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -688,49 +688,49 @@ class CaptureStd:
     """
     Context manager to capture:
 
-        - stdout: replay it, clean it up and make it available via ``obj.out``
-        - stderr: replay it and make it available via ``obj.err``
+        - stdout: replay it, clean it up and make it available via `obj.out`
+        - stderr: replay it and make it available via `obj.err`
 
-        init arguments:
+    Args:
+        out (`bool`, *optional*, defaults to `True`): Whether to capture stdout or not.
+        err (`bool`, *optional*, defaults to `True`): Whether to capture stderr or not.
+        replay (`bool`, *optional*, defaults to `True`): Whether to replay or not.
+            By default each captured stream gets replayed back on context's exit, so that one can see what the test was
+            doing. If this behavior is not wanted and the captured data shouldn't be replayed, pass `replay=False`
+            to disable this feature.
 
-        - out - capture stdout:`` True``/``False``, default ``True``
-        - err - capture stdout: ``True``/``False``, default ``True``
-        - replay - whether to replay or not: ``True``/``False``, default ``True``. By default each
-        captured stream gets replayed back on context's exit, so that one can see what the test was doing. If this is a
-        not wanted behavior and the captured data shouldn't be replayed, pass ``replay=False`` to disable this feature.
+    Examples:
 
-        Examples::
+    ```python
+    # to capture stdout only with auto-replay
+    with CaptureStdout() as cs:
+        print("Secret message")
+    assert "message" in cs.out
 
-            # to capture stdout only with auto-replay
-            with CaptureStdout() as cs:
-                print("Secret message")
-            assert "message" in cs.out
+    # to capture stderr only with auto-replay
+    import sys
+    with CaptureStderr() as cs:
+        print("Warning: ", file=sys.stderr)
+    assert "Warning" in cs.err
 
-            # to capture stderr only with auto-replay
-            import sys
-            with CaptureStderr() as cs:
-                print("Warning: ", file=sys.stderr)
-            assert "Warning" in cs.err
+    # to capture both streams with auto-replay
+    with CaptureStd() as cs:
+        print("Secret message")
+        print("Warning: ", file=sys.stderr)
+    assert "message" in cs.out
+    assert "Warning" in cs.err
 
-            # to capture both streams with auto-replay
-            with CaptureStd() as cs:
-                print("Secret message")
-                print("Warning: ", file=sys.stderr)
-            assert "message" in cs.out
-            assert "Warning" in cs.err
+    # to capture just one of the streams, and not the other, with auto-replay
+    with CaptureStd(err=False) as cs:
+        print("Secret message")
+    assert "message" in cs.out
+    # but best use the stream-specific subclasses
 
-            # to capture just one of the streams, and not the other, with auto-replay
-            with CaptureStd(err=False) as cs:
-                print("Secret message")
-            assert "message" in cs.out
-            # but best use the stream-specific subclasses
-
-            # to capture without auto-replay
-            with CaptureStd(replay=False) as cs:
-                print("Secret message")
-            assert "message" in cs.out
-
-    """
+    # to capture without auto-replay
+    with CaptureStd(replay=False) as cs:
+        print("Secret message")
+    assert "message" in cs.out
+    ```"""
 
     def __init__(self, out=True, err=True, replay=True):
 
@@ -810,23 +810,24 @@ class CaptureLogger:
     Context manager to capture `logging` streams
 
     Args:
+        logger: `logging` logger object
 
-    - logger: 'logging` logger object
-
-    Results:
+    Returns:
         The captured output is available via `self.out`
 
-    Example::
+    Example:
 
-        >>> from transformers import logging
-        >>> from transformers.testing_utils import CaptureLogger
+    ```python
+    >>> from transformers import logging
+    >>> from transformers.testing_utils import CaptureLogger
 
-        >>> msg = "Testing 1, 2, 3"
-        >>> logging.set_verbosity_info()
-        >>> logger = logging.get_logger("transformers.models.bart.tokenization_bart")
-        >>> with CaptureLogger(logger) as cl:
-        ...     logger.info(msg)
-        >>> assert cl.out, msg+"\n"
+    >>> msg = "Testing 1, 2, 3"
+    >>> logging.set_verbosity_info()
+    >>> logger = logging.get_logger("transformers.models.bart.tokenization_bart")
+    >>> with CaptureLogger(logger) as cl:
+    ...     logger.info(msg)
+    >>> assert cl.out, msg+"\n"
+    ```
     """
 
     def __init__(self, logger):
@@ -853,11 +854,12 @@ def LoggingLevel(level):
     This is a context manager to temporarily change transformers modules logging level to the desired value and have it
     restored to the original setting at the end of the scope.
 
-    For example ::
-
-        with LoggingLevel(logging.INFO):
-            AutoModel.from_pretrained("gpt2") # calls logger.info() several times
+    Example:
 
+    ```python
+    with LoggingLevel(logging.INFO):
+        AutoModel.from_pretrained("gpt2") # calls logger.info() several times
+    ```
     """
     orig_level = transformers_logging.get_verbosity()
     try:
@@ -873,11 +875,12 @@ def ExtendSysPath(path: Union[str, os.PathLike]) -> Iterator[None]:
     """
     Temporary add given path to `sys.path`.
 
-    Usage ::
-
-       with ExtendSysPath('/path/to/dir'):
-           mymodule = importlib.import_module('mymodule')
+    Usage:
 
+    ```python
+    with ExtendSysPath('/path/to/dir'):
+        mymodule = importlib.import_module('mymodule')
+    ```
     """
 
     path = os.fspath(path)
@@ -890,7 +893,7 @@ def ExtendSysPath(path: Union[str, os.PathLike]) -> Iterator[None]:
 
 class TestCasePlus(unittest.TestCase):
     """
-    This class extends `unittest.TestCase` with additional features.
+    This class extends `unittest.TestCase` with additional features.
 
     Feature 1: A set of fully resolved important file and dir path accessors.
 
@@ -898,75 +901,74 @@ class TestCasePlus(unittest.TestCase):
     test could be invoked from more than one directory or could reside in sub-directories with different depths. This
     class solves this problem by sorting out all the basic paths and provides easy accessors to them:
 
-    * ``pathlib`` objects (all fully resolved):
+    - `pathlib` objects (all fully resolved):
 
-       - ``test_file_path`` - the current test file path (=``__file__``)
-       - ``test_file_dir`` - the directory containing the current test file
-       - ``tests_dir`` - the directory of the ``tests`` test suite
-       - ``examples_dir`` - the directory of the ``examples`` test suite
-       - ``repo_root_dir`` - the directory of the repository
-       - ``src_dir`` - the directory of ``src`` (i.e. where the ``transformers`` sub-dir resides)
+       - `test_file_path` - the current test file path (=`__file__`)
+       - `test_file_dir` - the directory containing the current test file
+       - `tests_dir` - the directory of the `tests` test suite
+       - `examples_dir` - the directory of the `examples` test suite
+       - `repo_root_dir` - the directory of the repository
+       - `src_dir` - the directory of `src` (i.e. where the `transformers` sub-dir resides)
 
-    * stringified paths---same as above but these return paths as strings, rather than ``pathlib`` objects:
+    - stringified paths---same as above but these return paths as strings, rather than `pathlib` objects:
 
-       - ``test_file_path_str``
-       - ``test_file_dir_str``
-       - ``tests_dir_str``
-       - ``examples_dir_str``
-       - ``repo_root_dir_str``
-       - ``src_dir_str``
+       - `test_file_path_str`
+       - `test_file_dir_str`
+       - `tests_dir_str`
+       - `examples_dir_str`
+       - `repo_root_dir_str`
+       - `src_dir_str`
 
     Feature 2: Flexible auto-removable temporary dirs which are guaranteed to get removed at the end of test.
 
     1. Create a unique temporary dir:
 
-    ::
+    ```python
+    def test_whatever(self):
+        tmp_dir = self.get_auto_remove_tmp_dir()
+    ```
 
-        def test_whatever(self):
-            tmp_dir = self.get_auto_remove_tmp_dir()
-
-    ``tmp_dir`` will contain the path to the created temporary dir. It will be automatically removed at the end of the
+    `tmp_dir` will contain the path to the created temporary dir. It will be automatically removed at the end of the
     test.
 
 
     2. Create a temporary dir of my choice, ensure it's empty before the test starts and don't
     empty it after the test.
 
-    ::
-
-        def test_whatever(self):
-            tmp_dir = self.get_auto_remove_tmp_dir("./xxx")
+    ```python
+    def test_whatever(self):
+        tmp_dir = self.get_auto_remove_tmp_dir("./xxx")
+    ```
 
     This is useful for debug when you want to monitor a specific directory and want to make sure the previous tests
     didn't leave any data in there.
 
-    3. You can override the first two options by directly overriding the ``before`` and ``after`` args, leading to the
-       following behavior:
+    3. You can override the first two options by directly overriding the `before` and `after` args, leading to the
+        following behavior:
 
-    ``before=True``: the temporary dir will always be cleared at the beginning of the test.
+    `before=True`: the temporary dir will always be cleared at the beginning of the test.
 
-    ``before=False``: if the temporary dir already existed, any existing files will remain there.
+    `before=False`: if the temporary dir already existed, any existing files will remain there.
 
-    ``after=True``: the temporary dir will always be deleted at the end of the test.
+    `after=True`: the temporary dir will always be deleted at the end of the test.
 
-    ``after=False``: the temporary dir will always be left intact at the end of the test.
+    `after=False`: the temporary dir will always be left intact at the end of the test.
 
-    Note 1: In order to run the equivalent of ``rm -r`` safely, only subdirs of the project repository checkout are
-    allowed if an explicit ``tmp_dir`` is used, so that by mistake no ``/tmp`` or similar important part of the
-    filesystem will get nuked. i.e. please always pass paths that start with ``./``
+    Note 1: In order to run the equivalent of `rm -r` safely, only subdirs of the project repository checkout are
+    allowed if an explicit `tmp_dir` is used, so that by mistake no `/tmp` or similar important part of the
+    filesystem will get nuked. i.e. please always pass paths that start with `./`
 
     Note 2: Each test can register multiple temporary dirs and they all will get auto-removed, unless requested
     otherwise.
 
-    Feature 3: Get a copy of the ``os.environ`` object that sets up ``PYTHONPATH`` specific to the current test suite.
+    Feature 3: Get a copy of the `os.environ` object that sets up `PYTHONPATH` specific to the current test suite.
     This is useful for invoking external programs from the test suite - e.g. distributed training.
 
 
-    ::
-        def test_whatever(self):
-            env = self.get_env()
-
-    """
+    ```python
+    def test_whatever(self):
+        env = self.get_env()
+    ```"""
 
     def setUp(self):
         # get_auto_remove_tmp_dir feature:
@@ -1038,12 +1040,12 @@ class TestCasePlus(unittest.TestCase):
 
     def get_env(self):
         """
-        Return a copy of the ``os.environ`` object that sets up ``PYTHONPATH`` correctly, depending on the test suite
+        Return a copy of the `os.environ` object that sets up `PYTHONPATH` correctly, depending on the test suite
         it's invoked from. This is useful for invoking external programs from the test suite - e.g. distributed
         training.
 
-        It always inserts ``./src`` first, then ``./tests`` or ``./examples`` depending on the test suite type and
-        finally the preset ``PYTHONPATH`` if any (all full resolved paths).
+        It always inserts `./src` first, then `./tests` or `./examples` depending on the test suite type and
+        finally the preset `PYTHONPATH` if any (all full resolved paths).
 
         """
         env = os.environ.copy()
@@ -1060,26 +1062,26 @@ class TestCasePlus(unittest.TestCase):
     def get_auto_remove_tmp_dir(self, tmp_dir=None, before=None, after=None):
         """
         Args:
-            tmp_dir (:obj:`string`, `optional`):
-                if :obj:`None`:
+            tmp_dir (`string`, *optional*):
+                if `None`:
 
                    - a unique temporary path will be created
-                   - sets ``before=True`` if ``before`` is :obj:`None`
-                   - sets ``after=True`` if ``after`` is :obj:`None`
+                   - sets `before=True` if `before` is `None`
+                   - sets `after=True` if `after` is `None`
                 else:
 
-                   - :obj:`tmp_dir` will be created
-                   - sets ``before=True`` if ``before`` is :obj:`None`
-                   - sets ``after=False`` if ``after`` is :obj:`None`
-            before (:obj:`bool`, `optional`):
-                If :obj:`True` and the :obj:`tmp_dir` already exists, make sure to empty it right away if :obj:`False`
-                and the :obj:`tmp_dir` already exists, any existing files will remain there.
-            after (:obj:`bool`, `optional`):
-                If :obj:`True`, delete the :obj:`tmp_dir` at the end of the test if :obj:`False`, leave the
-                :obj:`tmp_dir` and its contents intact at the end of the test.
+                   - `tmp_dir` will be created
+                   - sets `before=True` if `before` is `None`
+                   - sets `after=False` if `after` is `None`
+            before (`bool`, *optional*):
+                If `True` and the `tmp_dir` already exists, make sure to empty it right away; if `False`
+                and the `tmp_dir` already exists, any existing files will remain there.
+            after (`bool`, *optional*):
+                If `True`, delete the `tmp_dir` at the end of the test; if `False`, leave the
+                `tmp_dir` and its contents intact at the end of the test.
 
         Returns:
-            tmp_dir(:obj:`string`): either the same value as passed via `tmp_dir` or the path to the auto-selected tmp
+            tmp_dir(`string`): either the same value as passed via `tmp_dir` or the path to the auto-selected tmp
             dir
         """
         if tmp_dir is not None:
@@ -1152,9 +1154,9 @@ def mockenv(**kwargs):
 @contextlib.contextmanager
 def mockenv_context(*remove, **update):
     """
-    Temporarily updates the ``os.environ`` dictionary in-place. Similar to mockenv
+    Temporarily updates the `os.environ` dictionary in-place. Similar to mockenv
 
-    The ``os.environ`` dictionary is updated in-place so that the modification is sure to work in all situations.
+    The `os.environ` dictionary is updated in-place so that the modification is sure to work in all situations.
 
     Args:
       remove: Environment variables to remove.
@@ -1423,8 +1425,8 @@ def execute_subprocess_async(cmd, env=None, stdin=None, timeout=180, quiet=False
 
 def pytest_xdist_worker_id():
     """
-    Returns an int value of worker's numerical id under ``pytest-xdist``'s concurrent workers ``pytest -n N`` regime,
-    or 0 if ``-n 1`` or ``pytest-xdist`` isn't being used.
+    Returns an int value of worker's numerical id under `pytest-xdist`'s concurrent workers `pytest -n N` regime,
+    or 0 if `-n 1` or `pytest-xdist` isn't being used.
     """
     worker = os.environ.get("PYTEST_XDIST_WORKER", "gw0")
     worker = re.sub(r"^gw", "", worker, 0, re.M)
@@ -1433,9 +1435,9 @@ def pytest_xdist_worker_id():
 
 def get_torch_dist_unique_port():
     """
-    Returns a port number that can be fed to ``torch.distributed.launch``'s ``--master_port`` argument.
+    Returns a port number that can be fed to `torch.distributed.launch`'s `--master_port` argument.
 
-    Under ``pytest-xdist`` it adds a delta number based on a worker id so that concurrent tests don't try to use the
+    Under `pytest-xdist` it adds a delta number based on a worker id so that concurrent tests don't try to use the
     same port at once.
     """
     port = 29500
diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py
index a8bcb98f85..fafb962d63 100644
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -66,15 +66,17 @@ class Trie:
 
         This function is idempotent, adding twice the same word will leave the trie unchanged
 
-        Example::
+        Example:
 
-            >>> trie = Trie()
-            >>> trie.add("Hello 友達")
-            >>> trie.data
-            {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}
-            >>> trie.add("Hello")
-            >>> trie.data
-            {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
+        ```python
+        >>> trie = Trie()
+        >>> trie.add("Hello 友達")
+        >>> trie.data
+        {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}}
+        >>> trie.add("Hello")
+        >>> trie.data
+        {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}}
+        ```
         """
         if not word:
             # Prevent empty string
@@ -92,16 +94,18 @@ class Trie:
 
         This trie will match the longest possible word first !
 
-        Example::
+        Example:
 
-            >>> trie = Trie()
-            >>> trie.split("[CLS] This is a extra_id_100")
-            ["[CLS] This is a extra_id_100"]
-            >>> trie.add("[CLS]")
-            >>> trie.add("extra_id_1")
-            >>> trie.add("extra_id_100")
-            >>> trie.split("[CLS] This is a extra_id_100")
-            ["[CLS]", " This is a ", "extra_id_100"]
+        ```python
+        >>> trie = Trie()
+        >>> trie.split("[CLS] This is a extra_id_100")
+        ["[CLS] This is a extra_id_100"]
+        >>> trie.add("[CLS]")
+        >>> trie.add("extra_id_1")
+        >>> trie.add("extra_id_100")
+        >>> trie.split("[CLS] This is a extra_id_100")
+        ["[CLS]", " This is a ", "extra_id_100"]
+        ```
         """
         # indexes are counted left of the chars index.
         # "hello", index 0, is left of h, index 1 is between h and e.
@@ -323,7 +327,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
     """
     Base class for all slow tokenizers.
 
-    Inherits from :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`.
+    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].
 
     Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading
     pretrained tokenizers as well as adding tokens to the vocabulary.
@@ -351,7 +355,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
     @property
     def vocab_size(self) -> int:
         """
-        :obj:`int`: Size of the base vocabulary (without the added tokens).
+        `int`: Size of the base vocabulary (without the added tokens).
         """
         raise NotImplementedError
 
@@ -360,7 +364,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         Returns the added tokens in the vocabulary as a dictionary of token to index.
 
         Returns:
-            :obj:`Dict[str, int]`: The added tokens.
+            `Dict[str, int]`: The added tokens.
         """
         return self.added_tokens_encoder
 
@@ -376,26 +380,27 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         it with indices starting from length of the current vocabulary.
 
         Args:
-            new_tokens (:obj:`List[str]`or :obj:`List[tokenizers.AddedToken]`):
+            new_tokens (`List[str]` or `List[tokenizers.AddedToken]`):
                 Token(s) to add in vocabulary. A token is only added if it's not already in the vocabulary (tested by
-                checking if the tokenizer assign the index of the ``unk_token`` to them).
-            special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                checking if the tokenizer assigns the index of the `unk_token` to them).
+            special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the tokens should be added as special tokens.
 
         Returns:
-            :obj:`int`: The number of tokens actually added to the vocabulary.
+            `int`: The number of tokens actually added to the vocabulary.
 
-        Examples::
+        Examples:
 
-            # Let's see how to increase the vocabulary of Bert model and tokenizer
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-            model = BertModel.from_pretrained('bert-base-uncased')
+        ```python
+        # Let's see how to increase the vocabulary of Bert model and tokenizer
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = BertModel.from_pretrained('bert-base-uncased')
 
-            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
-            print('We have added', num_added_toks, 'tokens')
-            # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
-            model.resize_token_embeddings(len(tokenizer))
-        """
+        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
+        print('We have added', num_added_toks, 'tokens')
+        # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
+        model.resize_token_embeddings(len(tokenizer))
+        ```"""
         new_tokens = [str(tok) for tok in new_tokens]
 
         tokens_to_add = []
@@ -447,17 +452,20 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         """
         Returns the number of added tokens when encoding a sequence with special tokens.
 
-        .. note::
-            This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not
-            put this inside your training loop.
+        <Tip>
+
+        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not
+        put this inside your training loop.
+
+        </Tip>
 
         Args:
-            pair (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            pair (`bool`, *optional*, defaults to `False`):
                 Whether the number of added tokens should be computed in the case of a sequence pair or a single
                 sequence.
 
         Returns:
-            :obj:`int`: Number of special tokens added to sequences.
+            `int`: Number of special tokens added to sequences.
         """
         token_ids_0 = []
         token_ids_1 = []
@@ -471,13 +479,13 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         (BPE/SentencePieces/WordPieces). Takes care of added tokens.
 
         Args:
-            text (:obj:`str`):
+            text (`str`):
                 The sequence to be encoded.
             **kwargs (additional keyword arguments):
-                Passed along to the model-specific ``prepare_for_tokenization`` preprocessing method.
+                Passed along to the model-specific `prepare_for_tokenization` preprocessing method.
 
         Returns:
-            :obj:`List[str]`: The list of tokens.
+            `List[str]`: The list of tokens.
         """
         # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
         all_special_tokens_extended = dict(
@@ -548,10 +556,10 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         vocabulary.
 
         Args:
-            tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
+            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).
 
         Returns:
-            :obj:`int` or :obj:`List[int]`: The token id or list of token ids.
+            `int` or `List[int]`: The token id or list of token ids.
         """
         if tokens is None:
             return None
@@ -807,21 +815,21 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         """
         Performs any necessary transformations before tokenization.
 
-        This method should pop the arguments from kwargs and return the remaining :obj:`kwargs` as well. We test the
-        :obj:`kwargs` at the end of the encoding process to be sure all the arguments have been used.
+        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
+        `kwargs` at the end of the encoding process to be sure all the arguments have been used.
 
         Args:
-            text (:obj:`str`):
+            text (`str`):
                 The text to prepare.
-            is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                 tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                 which it will tokenize. This is useful for NER or token classification.
             kwargs:
                 Keyword arguments to use for the tokenization.
 
         Returns:
-            :obj:`Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
+            `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs.
         """
         return (text, kwargs)
 
@@ -830,14 +838,14 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
     ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of ids of the first sequence.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 List of ids of the second sequence.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
@@ -871,13 +879,13 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
         added tokens.
 
         Args:
-            ids (:obj:`int` or :obj:`List[int]`):
+            ids (`int` or `List[int]`):
                 The token id (or token ids) to convert to tokens.
-            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
 
         Returns:
-            :obj:`str` or :obj:`List[str]`: The decoded token(s).
+            `str` or `List[str]`: The decoded token(s).
         """
         if isinstance(ids, int):
             if ids in self.added_tokens_decoder:
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 4ca518e8ad..e1dfabdae6 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -124,7 +124,7 @@ _re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json")
 
 class TruncationStrategy(ExplicitEnum):
     """
-    Possible values for the ``truncation`` argument in :meth:`PreTrainedTokenizerBase.__call__`. Useful for
+    Possible values for the `truncation` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for
     tab-completion in an IDE.
     """
 
@@ -139,8 +139,8 @@ class CharSpan(NamedTuple):
     Character span in the original string.
 
     Args:
-        start (:obj:`int`): Index of the first character in the original string.
-        end (:obj:`int`): Index of the character following the last character in the original string.
+        start (`int`): Index of the first character in the original string.
+        end (`int`): Index of the character following the last character in the original string.
     """
 
     start: int
@@ -152,8 +152,8 @@ class TokenSpan(NamedTuple):
     Token span in an encoded string (list of tokens).
 
     Args:
-        start (:obj:`int`): Index of the first token in the span.
-        end (:obj:`int`): Index of the token following the last token in the span.
+        start (`int`): Index of the first token in the span.
+        end (`int`): Index of the token following the last token in the span.
     """
 
     start: int
@@ -162,27 +162,27 @@ class TokenSpan(NamedTuple):
 
 class BatchEncoding(UserDict):
     """
-    Holds the output of the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.encode_plus` and
-    :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.batch_encode` methods (tokens,
+    Holds the output of the [`~tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`] and
+    [`~tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus`] methods (tokens,
     attention_masks, etc).
 
     This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes
     utility methods to map from word/character space to token space.
 
     Args:
-        data (:obj:`dict`):
+        data (`dict`):
             Dictionary of lists/arrays/tensors returned by the encode/batch_encode methods ('input_ids',
             'attention_mask', etc.).
-        encoding (:obj:`tokenizers.Encoding` or :obj:`Sequence[tokenizers.Encoding]`, `optional`):
+        encoding (`tokenizers.Encoding` or `Sequence[tokenizers.Encoding]`, *optional*):
             If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character
-            space to token space the :obj:`tokenizers.Encoding` instance or list of instance (for batches) hold this
+            space to token space the `tokenizers.Encoding` instance or list of instances (for batches) hold this
             information.
-        tensor_type (:obj:`Union[None, str, TensorType]`, `optional`):
+        tensor_type (`Union[None, str, TensorType]`, *optional*):
             You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
             initialization.
-        prepend_batch_axis (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to add a batch axis when converting to tensors (see :obj:`tensor_type` above).
-        n_sequences (:obj:`Optional[int]`, `optional`):
+        prepend_batch_axis (`bool`, *optional*, defaults to `False`):
+            Whether or not to add a batch axis when converting to tensors (see `tensor_type` above).
+        n_sequences (`Optional[int]`, *optional*):
             You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at
             initialization.
     """
@@ -212,26 +212,26 @@ class BatchEncoding(UserDict):
     @property
     def n_sequences(self) -> Optional[int]:
         """
-        :obj:`Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
-        :class:`~transformers.BatchEncoding`. Currently can be one of :obj:`None` (unknown), :obj:`1` (a single
-        sentence) or :obj:`2` (a pair of sentences)
+        `Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this
+        [`BatchEncoding`]. Currently can be one of `None` (unknown), `1` (a single
+        sentence) or `2` (a pair of sentences)
         """
         return self._n_sequences
 
     @property
     def is_fast(self) -> bool:
         """
-        :obj:`bool`: Indicate whether this :class:`~transformers.BatchEncoding` was generated from the result of a
-        :class:`~transformers.PreTrainedTokenizerFast` or not.
+        `bool`: Indicate whether this [`BatchEncoding`] was generated from the result of a
+        [`PreTrainedTokenizerFast`] or not.
         """
         return self._encodings is not None
 
     def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]:
         """
-        If the key is a string, returns the value of the dict associated to :obj:`key` ('input_ids', 'attention_mask',
+        If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask',
         etc.).
 
-        If the key is an integer, get the :obj:`tokenizers.Encoding` for batch item with index :obj:`key`.
+        If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`.
         """
         if isinstance(item, str):
             return self.data[item]
@@ -275,8 +275,8 @@ class BatchEncoding(UserDict):
     @property
     def encodings(self) -> Optional[List[EncodingFast]]:
         """
-        :obj:`Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process. Returns
-        :obj:`None` if the input was tokenized through Python (i.e., not a fast) tokenizer.
+        `Optional[List[tokenizers.Encoding]]`: The list of all encodings from the tokenization process. Returns
+        `None` if the input was tokenized through Python (i.e., not a fast) tokenizer.
         """
         return self._encodings
 
@@ -286,10 +286,10 @@ class BatchEncoding(UserDict):
         integer indices) at a given batch index (only works for the output of a fast tokenizer).
 
         Args:
-            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
+            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
 
         Returns:
-            :obj:`List[str]`: The list of tokens at that index.
+            `List[str]`: The list of tokens at that index.
         """
         if not self._encodings:
             raise ValueError("tokens() is not available when using Python-based tokenizers")
@@ -299,17 +299,17 @@ class BatchEncoding(UserDict):
         """
         Return a list mapping the tokens to the id of their original sentences:
 
-            - :obj:`None` for special tokens added around or between sequences,
-            - :obj:`0` for tokens corresponding to words in the first sequence,
-            - :obj:`1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
+            - `None` for special tokens added around or between sequences,
+            - `0` for tokens corresponding to words in the first sequence,
+            - `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly
               encoded.
 
         Args:
-            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
+            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
 
         Returns:
-            :obj:`List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens
-            added by the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their
+            `List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens
+            added by the tokenizer are mapped to `None` and other tokens are mapped to the index of their
             corresponding sequence.
         """
         if not self._encodings:
@@ -321,11 +321,11 @@ class BatchEncoding(UserDict):
         Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
 
         Args:
-            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
+            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
 
         Returns:
-            :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by
-            the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding
+            `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by
+            the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding
             word (several tokens will be mapped to the same word index if they are parts of that word).
         """
         if not self._encodings:
@@ -342,11 +342,11 @@ class BatchEncoding(UserDict):
         Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer.
 
         Args:
-            batch_index (:obj:`int`, `optional`, defaults to 0): The index to access in the batch.
+            batch_index (`int`, *optional*, defaults to 0): The index to access in the batch.
 
         Returns:
-            :obj:`List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by
-            the tokenizer are mapped to :obj:`None` and other tokens are mapped to the index of their corresponding
+            `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by
+            the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding
             word (several tokens will be mapped to the same word index if they are parts of that word).
         """
         if not self._encodings:
@@ -356,27 +356,27 @@ class BatchEncoding(UserDict):
     def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int:
         """
         Get the index of the sequence represented by the given token. In the general use case, this method returns
-        :obj:`0` for a single sequence or the first sequence of a pair, and :obj:`1` for the second sequence of a pair
+        `0` for a single sequence or the first sequence of a pair, and `1` for the second sequence of a pair
 
         Can be called as:
 
-        - ``self.token_to_sequence(token_index)`` if batch size is 1
-        - ``self.token_to_sequence(batch_index, token_index)`` if batch size is greater than 1
+        - `self.token_to_sequence(token_index)` if batch size is 1
+        - `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1
 
         This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
         words are defined by the user). In this case it allows to easily associate encoded tokens with provided
         tokenized words.
 
         Args:
-            batch_or_token_index (:obj:`int`):
+            batch_or_token_index (`int`):
                 Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                 the token in the sequence.
-            token_index (:obj:`int`, `optional`):
-                If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the
+            token_index (`int`, *optional*):
+                If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
                 sequence.
 
         Returns:
-            :obj:`int`: Index of the word in the input sequence.
+            `int`: Index of the word in the input sequence.
         """
 
         if not self._encodings:
@@ -398,23 +398,23 @@ class BatchEncoding(UserDict):
 
         Can be called as:
 
-        - ``self.token_to_word(token_index)`` if batch size is 1
-        - ``self.token_to_word(batch_index, token_index)`` if batch size is greater than 1
+        - `self.token_to_word(token_index)` if batch size is 1
+        - `self.token_to_word(batch_index, token_index)` if batch size is greater than 1
 
         This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e.,
         words are defined by the user). In this case it allows to easily associate encoded tokens with provided
         tokenized words.
 
         Args:
-            batch_or_token_index (:obj:`int`):
+            batch_or_token_index (`int`):
                 Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
                 the token in the sequence.
-            token_index (:obj:`int`, `optional`):
-                If a batch index is provided in `batch_or_token_index`, this can be the index of the token in the
+            token_index (`int`, *optional*):
+                If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the
                 sequence.
 
         Returns:
-            :obj:`int`: Index of the word in the input sequence.
+            `int`: Index of the word in the input sequence.
         """
 
         if not self._encodings:
@@ -436,15 +436,15 @@ class BatchEncoding(UserDict):
         """
         Get the encoded token span corresponding to a word in a sequence of the batch.
 
-        Token spans are returned as a :class:`~transformers.tokenization_utils_base.TokenSpan` with:
+        Token spans are returned as a [`~tokenization_utils_base.TokenSpan`] with:
 
         - **start** -- Index of the first token.
         - **end** -- Index of the token following the last token.
 
         Can be called as:
 
-        - ``self.word_to_tokens(word_index, sequence_index: int = 0)`` if batch size is 1
-        - ``self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)`` if batch size is greater or equal
+        - `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1
+        - `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal
           to 1
 
         This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
@@ -452,19 +452,19 @@ class BatchEncoding(UserDict):
         words.
 
         Args:
-            batch_or_word_index (:obj:`int`):
+            batch_or_word_index (`int`):
                 Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of
                 the word in the sequence.
-            word_index (:obj:`int`, `optional`):
-                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
+            word_index (`int`, *optional*):
+                If a batch index is provided in `batch_or_word_index`, this can be the index of the word in the
                 sequence.
-            sequence_index (:obj:`int`, `optional`, defaults to 0):
+            sequence_index (`int`, *optional*, defaults to 0):
                 If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
                 or 1) the provided word index belongs to.
 
         Returns:
-            Optional :class:`~transformers.tokenization_utils_base.TokenSpan` Span of tokens in the encoded sequence.
-            Returns :obj:`None` if no tokens correspond to the word.
+            Optional [`~tokenization_utils_base.TokenSpan`]: Span of tokens in the encoded sequence.
+            Returns `None` if no tokens correspond to the word.
         """
 
         if not self._encodings:
@@ -485,7 +485,7 @@ class BatchEncoding(UserDict):
         """
         Get the character span corresponding to an encoded token in a sequence of the batch.
 
-        Character spans are returned as a :class:`~transformers.tokenization_utils_base.CharSpan` with:
+        Character spans are returned as a [`~tokenization_utils_base.CharSpan`] with:
 
         - **start** -- Index of the first character in the original string associated to the token.
         - **end** -- Index of the character following the last character in the original string associated to the
@@ -493,19 +493,19 @@ class BatchEncoding(UserDict):
 
         Can be called as:
 
-        - ``self.token_to_chars(token_index)`` if batch size is 1
-        - ``self.token_to_chars(batch_index, token_index)`` if batch size is greater or equal to 1
+        - `self.token_to_chars(token_index)` if batch size is 1
+        - `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1
 
         Args:
-            batch_or_token_index (:obj:`int`):
+            batch_or_token_index (`int`):
                 Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
                 the token in the sequence.
-            token_index (:obj:`int`, `optional`):
-                If a batch index is provided in `batch_or_token_index`, this can be the index of the token or tokens in
+            token_index (`int`, *optional*):
+                If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens in
                 the sequence.
 
         Returns:
-            :class:`~transformers.tokenization_utils_base.CharSpan`: Span of characters in the original string.
+            [`~tokenization_utils_base.CharSpan`]: Span of characters in the original string.
         """
 
         if not self._encodings:
@@ -526,27 +526,27 @@ class BatchEncoding(UserDict):
 
         Can be called as:
 
-        - ``self.char_to_token(char_index)`` if batch size is 1
-        - ``self.char_to_token(batch_index, char_index)`` if batch size is greater or equal to 1
+        - `self.char_to_token(char_index)` if batch size is 1
+        - `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1
 
         This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
         are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
         words.
 
         Args:
-            batch_or_char_index (:obj:`int`):
+            batch_or_char_index (`int`):
                 Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
                 the word in the sequence
-            char_index (:obj:`int`, `optional`):
-                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
+            char_index (`int`, *optional*):
+                If a batch index is provided in `batch_or_char_index`, this can be the index of the character in the
                 sequence.
-            sequence_index (:obj:`int`, `optional`, defaults to 0):
+            sequence_index (`int`, *optional*, defaults to 0):
                 If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
                 or 1) the provided character index belongs to.
 
 
         Returns:
-            :obj:`int`: Index of the token.
+            `int`: Index of the token.
         """
 
         if not self._encodings:
@@ -571,22 +571,22 @@ class BatchEncoding(UserDict):
 
         Can be called as:
 
-        - ``self.word_to_chars(word_index)`` if batch size is 1
-        - ``self.word_to_chars(batch_index, word_index)`` if batch size is greater or equal to 1
+        - `self.word_to_chars(word_index)` if batch size is 1
+        - `self.word_to_chars(batch_index, word_index)` if batch size is greater or equal to 1
 
         Args:
-            batch_or_word_index (:obj:`int`):
+            batch_or_word_index (`int`):
                 Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
                 the word in the sequence
-            word_index (:obj:`int`, `optional`):
-                If a batch index is provided in `batch_or_token_index`, this can be the index of the word in the
+            word_index (`int`, *optional*):
+                If a batch index is provided in `batch_or_word_index`, this can be the index of the word in the
                 sequence.
-            sequence_index (:obj:`int`, `optional`, defaults to 0):
+            sequence_index (`int`, *optional*, defaults to 0):
                 If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
                 or 1) the provided word index belongs to.
 
         Returns:
-            :obj:`CharSpan` or :obj:`List[CharSpan]`: Span(s) of the associated character or characters in the string.
+            `CharSpan` or `List[CharSpan]`: Span(s) of the associated character or characters in the string.
             CharSpan are NamedTuple with:
 
                 - start: index of the first character associated to the token in the original string
@@ -610,27 +610,27 @@ class BatchEncoding(UserDict):
 
         Can be called as:
 
-        - ``self.char_to_word(char_index)`` if batch size is 1
-        - ``self.char_to_word(batch_index, char_index)`` if batch size is greater than 1
+        - `self.char_to_word(char_index)` if batch size is 1
+        - `self.char_to_word(batch_index, char_index)` if batch size is greater than 1
 
         This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words
         are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized
         words.
 
         Args:
-            batch_or_char_index (:obj:`int`):
+            batch_or_char_index (`int`):
                 Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of
                 the character in the original string.
-            char_index (:obj:`int`, `optional`):
-                If a batch index is provided in `batch_or_token_index`, this can be the index of the character in the
+            char_index (`int`, *optional*):
+                If a batch index is provided in `batch_or_char_index`, this can be the index of the character in the
                 original string.
-            sequence_index (:obj:`int`, `optional`, defaults to 0):
+            sequence_index (`int`, *optional*, defaults to 0):
                 If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0
                 or 1) the provided character index belongs to.
 
 
         Returns:
-            :obj:`int` or :obj:`List[int]`: Index or indices of the associated encoded token(s).
+            `int` or `List[int]`: Index or indices of the associated encoded token(s).
         """
 
         if not self._encodings:
@@ -649,10 +649,10 @@ class BatchEncoding(UserDict):
         Convert the inner content to tensors.
 
         Args:
-            tensor_type (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
-                The type of tensors to use. If :obj:`str`, should be one of the values of the enum
-                :class:`~transformers.file_utils.TensorType`. If :obj:`None`, no modification is done.
-            prepend_batch_axis (:obj:`int`, `optional`, defaults to :obj:`False`):
+            tensor_type (`str` or [`~file_utils.TensorType`], *optional*):
+                The type of tensors to use. If `str`, should be one of the values of the enum
+                [`~file_utils.TensorType`]. If `None`, no modification is done.
+            prepend_batch_axis (`bool`, *optional*, defaults to `False`):
                 Whether or not to add the batch dimension during the conversion.
         """
         if tensor_type is None:
@@ -728,13 +728,13 @@ class BatchEncoding(UserDict):
     @torch_required
     def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding":
         """
-        Send all values to device by calling :obj:`v.to(device)` (PyTorch only).
+        Send all values to device by calling `v.to(device)` (PyTorch only).
 
         Args:
-            device (:obj:`str` or :obj:`torch.device`): The device to put the tensors on.
+            device (`str` or `torch.device`): The device to put the tensors on.
 
         Returns:
-            :class:`~transformers.BatchEncoding`: The same instance after modification.
+            [`BatchEncoding`]: The same instance after modification.
         """
 
         # This check catches things like APEX blindly calling "to" on all inputs to a module
@@ -749,29 +749,29 @@ class BatchEncoding(UserDict):
 
 class SpecialTokensMixin:
     """
-    A mixin derived by :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` to
+    A mixin derived by [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] to
     handle specific behaviors related to special tokens. In particular, this class hold the attributes which can be
     used to directly access these special tokens in a model-independent manner and allow to set and update the special
     tokens.
 
     Args:
-        bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token representing the beginning of a sentence.
-        eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token representing the end of a sentence.
-        unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token representing an out-of-vocabulary token.
-        sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+        sep_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token separating two different sentences in the same input (used by BERT for instance).
-        pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
             attention mechanisms or loss computation.
-        cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+        cls_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token representing the class of the input (used by BERT for instance).
-        mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+        mask_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token representing a masked token (used by masked-language modeling pretraining objectives, like
             BERT).
-        additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+        additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
             A tuple or a list of additional special tokens.
     """
 
@@ -818,13 +818,13 @@ class SpecialTokensMixin:
 
     def sanitize_special_tokens(self) -> int:
         """
-        Make sure that all the special tokens attributes of the tokenizer (:obj:`tokenizer.mask_token`,
-        :obj:`tokenizer.cls_token`, etc.) are in the vocabulary.
+        Make sure that all the special tokens attributes of the tokenizer (`tokenizer.mask_token`,
+        `tokenizer.cls_token`, etc.) are in the vocabulary.
 
         Add the missing ones to the vocabulary if needed.
 
         Return:
-            :obj:`int`: The number of tokens added in the vocabulary during the operation.
+            `int`: The number of tokens added in the vocabulary during the operation.
         """
         return self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
 
@@ -834,49 +834,50 @@ class SpecialTokensMixin:
         special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the
         current vocabulary).
 
-        .. Note::
-            When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of
-            the model so that its embedding matrix matches the tokenizer.
+        Note:
+        When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of
+        the model so that its embedding matrix matches the tokenizer.
 
-            In order to do that, please use the :meth:`~transformers.PreTrainedModel.resize_token_embeddings` method.
+        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.
 
-        Using :obj:`add_special_tokens` will ensure your special tokens can be used in several ways:
+        Using `add_special_tokens` will ensure your special tokens can be used in several ways:
 
         - Special tokens are carefully handled by the tokenizer (they are never split).
-        - You can easily refer to special tokens using tokenizer class attributes like :obj:`tokenizer.cls_token`. This
+        - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This
           makes it easy to develop model-agnostic training and fine-tuning scripts.
 
         When possible, special tokens are already registered for provided pretrained models (for instance
-        :class:`~transformers.BertTokenizer` :obj:`cls_token` is already registered to be :obj`'[CLS]'` and XLM's one
-        is also registered to be :obj:`'</s>'`).
+        [`BertTokenizer`] `cls_token` is already registered to be `'[CLS]'` and XLM's one
+        is also registered to be `'</s>'`).
 
         Args:
-            special_tokens_dict (dictionary `str` to `str` or :obj:`tokenizers.AddedToken`):
-                Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``,
-                ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
-                ``additional_special_tokens``].
+            special_tokens_dict (dictionary `str` to `str` or `tokenizers.AddedToken`):
+                Keys should be in the list of predefined special attributes: [`bos_token`, `eos_token`,
+                `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`,
+                `additional_special_tokens`].
 
                 Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer
-                assign the index of the ``unk_token`` to them).
+                assign the index of the `unk_token` to them).
 
         Returns:
-            :obj:`int`: Number of tokens added to the vocabulary.
+            `int`: Number of tokens added to the vocabulary.
 
-        Examples::
+        Examples:
 
-            # Let's see how to add a new classification token to GPT-2
-            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-            model = GPT2Model.from_pretrained('gpt2')
+        ```python
+        # Let's see how to add a new classification token to GPT-2
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        model = GPT2Model.from_pretrained('gpt2')
 
-            special_tokens_dict = {'cls_token': '<CLS>'}
+        special_tokens_dict = {'cls_token': '<CLS>'}
 
-            num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
-            print('We have added', num_added_toks, 'tokens')
-            # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
-            model.resize_token_embeddings(len(tokenizer))
+        num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
+        print('We have added', num_added_toks, 'tokens')
+        # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
+        model.resize_token_embeddings(len(tokenizer))
 
-            assert tokenizer.cls_token == '<CLS>'
-        """
+        assert tokenizer.cls_token == '<CLS>'
+        ```"""
         if not special_tokens_dict:
             return 0
 
@@ -908,38 +909,39 @@ class SpecialTokensMixin:
         Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to
         it with indices starting from length of the current vocabulary.
 
-        .. Note::
-            When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of
-            the model so that its embedding matrix matches the tokenizer.
+        Note:
+        When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of
+        the model so that its embedding matrix matches the tokenizer.
 
-            In order to do that, please use the :meth:`~transformers.PreTrainedModel.resize_token_embeddings` method.
+        In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method.
 
         Args:
-            new_tokens (:obj:`str`, :obj:`tokenizers.AddedToken` or a list of `str` or :obj:`tokenizers.AddedToken`):
-                Tokens are only added if they are not already in the vocabulary. :obj:`tokenizers.AddedToken` wraps a
+            new_tokens (`str`, `tokenizers.AddedToken` or a list of `str` or `tokenizers.AddedToken`):
+                Tokens are only added if they are not already in the vocabulary. `tokenizers.AddedToken` wraps a
                 string token to let you personalize its behavior: whether this token should only match against a single
                 word, whether this token should strip all potential whitespaces on the left side, whether this token
                 should strip all potential whitespaces on the right side, etc.
-            special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            special_tokens (`bool`, *optional*, defaults to `False`):
                 Can be used to specify if the token is a special token. This mostly change the normalization behavior
                 (special tokens like CLS or [MASK] are usually not lower-cased for instance).
 
-                See details for :obj:`tokenizers.AddedToken` in HuggingFace tokenizers library.
+                See details for `tokenizers.AddedToken` in HuggingFace tokenizers library.
 
         Returns:
-            :obj:`int`: Number of tokens added to the vocabulary.
+            `int`: Number of tokens added to the vocabulary.
 
-        Examples::
+        Examples:
 
-            # Let's see how to increase the vocabulary of Bert model and tokenizer
-            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
-            model = BertModel.from_pretrained('bert-base-uncased')
+        ```python
+        # Let's see how to increase the vocabulary of Bert model and tokenizer
+        tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
+        model = BertModel.from_pretrained('bert-base-uncased')
 
-            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
-            print('We have added', num_added_toks, 'tokens')
-             # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
-            model.resize_token_embeddings(len(tokenizer))
-        """
+        num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
+        print('We have added', num_added_toks, 'tokens')
+        # Notice: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e., the length of the tokenizer.
+        model.resize_token_embeddings(len(tokenizer))
+        ```"""
         if not new_tokens:
             return 0
 
@@ -954,7 +956,7 @@ class SpecialTokensMixin:
     @property
     def bos_token(self) -> str:
         """
-        :obj:`str`: Beginning of sentence token. Log an error if used while not having been set.
+        `str`: Beginning of sentence token. Log an error if used while not having been set.
         """
         if self._bos_token is None and self.verbose:
             logger.error("Using bos_token, but it is not set yet.")
@@ -964,7 +966,7 @@ class SpecialTokensMixin:
     @property
     def eos_token(self) -> str:
         """
-        :obj:`str`: End of sentence token. Log an error if used while not having been set.
+        `str`: End of sentence token. Log an error if used while not having been set.
         """
         if self._eos_token is None and self.verbose:
             logger.error("Using eos_token, but it is not set yet.")
@@ -974,7 +976,7 @@ class SpecialTokensMixin:
     @property
     def unk_token(self) -> str:
         """
-        :obj:`str`: Unknown token. Log an error if used while not having been set.
+        `str`: Unknown token. Log an error if used while not having been set.
         """
         if self._unk_token is None and self.verbose:
             logger.error("Using unk_token, but it is not set yet.")
@@ -984,7 +986,7 @@ class SpecialTokensMixin:
     @property
     def sep_token(self) -> str:
         """
-        :obj:`str`: Separation token, to separate context and query in an input sequence. Log an error if used while
+        `str`: Separation token, to separate context and query in an input sequence. Log an error if used while
         not having been set.
         """
         if self._sep_token is None and self.verbose:
@@ -995,7 +997,7 @@ class SpecialTokensMixin:
     @property
     def pad_token(self) -> str:
         """
-        :obj:`str`: Padding token. Log an error if used while not having been set.
+        `str`: Padding token. Log an error if used while not having been set.
         """
         if self._pad_token is None and self.verbose:
             logger.error("Using pad_token, but it is not set yet.")
@@ -1005,7 +1007,7 @@ class SpecialTokensMixin:
     @property
     def cls_token(self) -> str:
         """
-        :obj:`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the
+        `str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the
         full depth of the model. Log an error if used while not having been set.
         """
         if self._cls_token is None and self.verbose:
@@ -1016,7 +1018,7 @@ class SpecialTokensMixin:
     @property
     def mask_token(self) -> str:
         """
-        :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
+        `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
         not having been set.
         """
         if self._mask_token is None and self.verbose:
@@ -1027,7 +1029,7 @@ class SpecialTokensMixin:
     @property
     def additional_special_tokens(self) -> List[str]:
         """
-        :obj:`List[str]`: All the additional special tokens you may want to use. Log an error if used while not having
+        `List[str]`: All the additional special tokens you may want to use. Log an error if used while not having
         been set.
         """
         if self._additional_special_tokens is None and self.verbose:
@@ -1070,7 +1072,7 @@ class SpecialTokensMixin:
     @property
     def bos_token_id(self) -> Optional[int]:
         """
-        :obj:`Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns :obj:`None` if the token
+        `Optional[int]`: Id of the beginning of sentence token in the vocabulary. Returns `None` if the token
         has not been set.
         """
         if self._bos_token is None:
@@ -1080,7 +1082,7 @@ class SpecialTokensMixin:
     @property
     def eos_token_id(self) -> Optional[int]:
         """
-        :obj:`Optional[int]`: Id of the end of sentence token in the vocabulary. Returns :obj:`None` if the token has
+        `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has
         not been set.
         """
         if self._eos_token is None:
@@ -1090,7 +1092,7 @@ class SpecialTokensMixin:
     @property
     def unk_token_id(self) -> Optional[int]:
         """
-        :obj:`Optional[int]`: Id of the unknown token in the vocabulary. Returns :obj:`None` if the token has not been
+        `Optional[int]`: Id of the unknown token in the vocabulary. Returns `None` if the token has not been
         set.
         """
         if self._unk_token is None:
@@ -1100,8 +1102,8 @@ class SpecialTokensMixin:
     @property
     def sep_token_id(self) -> Optional[int]:
         """
-        :obj:`Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input
-        sequence. Returns :obj:`None` if the token has not been set.
+        `Optional[int]`: Id of the separation token in the vocabulary, to separate context and query in an input
+        sequence. Returns `None` if the token has not been set.
         """
         if self._sep_token is None:
             return None
@@ -1110,7 +1112,7 @@ class SpecialTokensMixin:
     @property
     def pad_token_id(self) -> Optional[int]:
         """
-        :obj:`Optional[int]`: Id of the padding token in the vocabulary. Returns :obj:`None` if the token has not been
+        `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been
         set.
         """
         if self._pad_token is None:
@@ -1120,17 +1122,17 @@ class SpecialTokensMixin:
     @property
     def pad_token_type_id(self) -> int:
         """
-        :obj:`int`: Id of the padding token type in the vocabulary.
+        `int`: Id of the padding token type in the vocabulary.
         """
         return self._pad_token_type_id
 
     @property
     def cls_token_id(self) -> Optional[int]:
         """
-        :obj:`Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input
+        `Optional[int]`: Id of the classification token in the vocabulary, to extract a summary of an input
         sequence leveraging self-attention along the full depth of the model.
 
-        Returns :obj:`None` if the token has not been set.
+        Returns `None` if the token has not been set.
         """
         if self._cls_token is None:
             return None
@@ -1139,8 +1141,8 @@ class SpecialTokensMixin:
     @property
     def mask_token_id(self) -> Optional[int]:
         """
-        :obj:`Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language
-        modeling. Returns :obj:`None` if the token has not been set.
+        `Optional[int]`: Id of the mask token in the vocabulary, used when training a model with masked-language
+        modeling. Returns `None` if the token has not been set.
         """
         if self._mask_token is None:
             return None
@@ -1149,7 +1151,7 @@ class SpecialTokensMixin:
     @property
     def additional_special_tokens_ids(self) -> List[int]:
         """
-        :obj:`List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not
+        `List[int]`: Ids of all the additional special tokens in the vocabulary. Log an error if used while not
         having been set.
         """
         return self.convert_tokens_to_ids(self.additional_special_tokens)
@@ -1189,10 +1191,10 @@ class SpecialTokensMixin:
     @property
     def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]:
         """
-        :obj:`Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (:obj:`cls_token`,
-        :obj:`unk_token`, etc.) to their values (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).
+        `Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (`cls_token`,
+        `unk_token`, etc.) to their values (`'<unk>'`, `'<cls>'`, etc.).
 
-        Convert potential tokens of :obj:`tokenizers.AddedToken` type to string.
+        Convert potential tokens of `tokenizers.AddedToken` type to string.
         """
         set_attr = {}
         for attr in self.SPECIAL_TOKENS_ATTRIBUTES:
@@ -1208,11 +1210,11 @@ class SpecialTokensMixin:
     @property
     def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]:
         """
-        :obj:`Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary
-        mapping special token class attributes (:obj:`cls_token`, :obj:`unk_token`, etc.) to their values
-        (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.).
+        `Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary
+        mapping special token class attributes (`cls_token`, `unk_token`, etc.) to their values
+        (`'<unk>'`, `'<cls>'`, etc.).
 
-        Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more finely
+        Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely
         how special tokens are tokenized.
         """
         set_attr = {}
@@ -1225,9 +1227,9 @@ class SpecialTokensMixin:
     @property
     def all_special_tokens(self) -> List[str]:
         """
-        :obj:`List[str]`: All the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class attributes.
+        `List[str]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
 
-        Convert tokens of :obj:`tokenizers.AddedToken` type to string.
+        Convert tokens of `tokenizers.AddedToken` type to string.
         """
         all_toks = [str(s) for s in self.all_special_tokens_extended]
         return all_toks
@@ -1235,10 +1237,10 @@ class SpecialTokensMixin:
     @property
     def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]:
         """
-        :obj:`List[Union[str, tokenizers.AddedToken]]`: All the special tokens (:obj:`'<unk>'`, :obj:`'<cls>'`, etc.)
+        `List[Union[str, tokenizers.AddedToken]]`: All the special tokens (`'<unk>'`, `'<cls>'`, etc.)
         mapped to class attributes.
 
-        Don't convert tokens of :obj:`tokenizers.AddedToken` type to string so they can be used to control more finely
+        Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely
         how special tokens are tokenized.
         """
         all_toks = []
@@ -1251,7 +1253,7 @@ class SpecialTokensMixin:
     @property
     def all_special_ids(self) -> List[int]:
         """
-        :obj:`List[int]`: List the ids of the special tokens(:obj:`'<unk>'`, :obj:`'<cls>'`, etc.) mapped to class
+        `List[int]`: List the ids of the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class
         attributes.
         """
         all_toks = self.all_special_tokens
@@ -1260,180 +1262,180 @@ class SpecialTokensMixin:
 
 
 ENCODE_KWARGS_DOCSTRING = r"""
-            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
                 Whether or not to encode the sequences with the special tokens relative to their model.
-            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
                 Activates and controls padding. Accepts the following values:
 
-                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                   single sequence if provided).
-                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                   maximum acceptable input length for the model if that argument is not provided.
-                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                   different lengths).
-            truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
                 Activates and controls truncation. Accepts the following values:
 
-                * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
-                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
+                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument
+                  `max_length` or to the maximum acceptable input length for the model if that argument is not
                   provided. This will truncate token by token, removing a token from the longest sequence in the pair
                   if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to
                   the maximum acceptable input length for the model if that argument is not provided. This will only
                   truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or
                   to the maximum acceptable input length for the model if that argument is not provided. This will only
                   truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with
                   sequence lengths greater than the model maximum admissible input size).
-            max_length (:obj:`int`, `optional`):
+            max_length (`int`, *optional*):
                 Controls the maximum length to use by one of the truncation/padding parameters.
 
-                If left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum
                 length is required by one of the truncation/padding parameters. If the model has no specific maximum
                 input length (like XLNet) truncation/padding to a maximum length will be deactivated.
-            stride (:obj:`int`, `optional`, defaults to 0):
-                If set to a number along with :obj:`max_length`, the overflowing tokens returned when
-                :obj:`return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+            stride (`int`, *optional*, defaults to 0):
+                If set to a number along with `max_length`, the overflowing tokens returned when
+                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
                 returned to provide some overlap between truncated and overflowing sequences. The value of this
                 argument defines the number of overlapping tokens.
-            is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not the input is already pre-tokenized (e.g., split into words). If set to :obj:`True`, the
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                 tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                 which it will tokenize. This is useful for NER or token classification.
-            pad_to_multiple_of (:obj:`int`, `optional`):
+            pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
 """
 
 ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
-            return_token_type_ids (:obj:`bool`, `optional`):
+            return_token_type_ids (`bool`, *optional*):
                 Whether to return token type IDs. If left to the default, will return the token type IDs according to
-                the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+                the specific tokenizer's default, defined by the `return_outputs` attribute.
 
-                `What are token type IDs? <../glossary.html#token-type-ids>`__
-            return_attention_mask (:obj:`bool`, `optional`):
+                [What are token type IDs?](../glossary#token-type-ids)
+            return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
-                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+                to the specific tokenizer's default, defined by the `return_outputs` attribute.
 
-                `What are attention masks? <../glossary.html#attention-mask>`__
-            return_overflowing_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                [What are attention masks?](../glossary#attention-mask)
+            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
-                of pairs) is provided with :obj:`truncation_strategy = longest_first` or :obj:`True`, an error is
+                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is
                 raised instead of returning overflowing tokens.
-            return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
                 Whether or not to return special tokens mask information.
-            return_offsets_mapping (:obj:`bool`, `optional`, defaults to :obj:`False`):
-                Whether or not to return :obj:`(char_start, char_end)` for each token.
+            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
+                Whether or not to return `(char_start, char_end)` for each token.
 
                 This is only available on fast tokenizers inheriting from
-                :class:`~transformers.PreTrainedTokenizerFast`, if using Python's tokenizer, this method will raise
-                :obj:`NotImplementedError`.
-            return_length  (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                [`PreTrainedTokenizerFast`], if using Python's tokenizer, this method will raise
+                `NotImplementedError`.
+            return_length (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the lengths of the encoded inputs.
-            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            verbose (`bool`, *optional*, defaults to `True`):
                 Whether or not to print more information and warnings.
-            **kwargs: passed to the :obj:`self.tokenize()` method
+            **kwargs: passed to the `self.tokenize()` method
 
-        Return:
-            :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
+        Return:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
 
             - **input_ids** -- List of token ids to be fed to a model.
 
-              `What are input IDs? <../glossary.html#input-ids>`__
+              [What are input IDs?](../glossary#input-ids)
 
-            - **token_type_ids** -- List of token type ids to be fed to a model (when :obj:`return_token_type_ids=True`
-              or if `"token_type_ids"` is in :obj:`self.model_input_names`).
+            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True`
+              or if `"token_type_ids"` is in `self.model_input_names`).
 
-              `What are token type IDs? <../glossary.html#token-type-ids>`__
+              [What are token type IDs?](../glossary#token-type-ids)
 
             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
-              :obj:`return_attention_mask=True` or if `"attention_mask"` is in :obj:`self.model_input_names`).
+              `return_attention_mask=True` or if `"attention_mask"` is in `self.model_input_names`).
 
-              `What are attention masks? <../glossary.html#attention-mask>`__
+              [What are attention masks?](../glossary#attention-mask)
 
-            - **overflowing_tokens** -- List of overflowing tokens sequences (when a :obj:`max_length` is specified and
-              :obj:`return_overflowing_tokens=True`).
-            - **num_truncated_tokens** -- Number of tokens truncated (when a :obj:`max_length` is specified and
-              :obj:`return_overflowing_tokens=True`).
+            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
             - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
-              regular sequence tokens (when :obj:`add_special_tokens=True` and :obj:`return_special_tokens_mask=True`).
-            - **length** -- The length of the inputs (when :obj:`return_length=True`)
+              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
+            - **length** -- The length of the inputs (when `return_length=True`)
 """
 
 INIT_TOKENIZER_DOCSTRING = r"""
     Class attributes (overridden by derived classes)
 
-        - **vocab_files_names** (:obj:`Dict[str, str]`) -- A dictionary with, as keys, the ``__init__`` keyword name of
+        - **vocab_files_names** (`Dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of
           each vocabulary file required by the model, and as associated values, the filename for saving the associated
           file (string).
-        - **pretrained_vocab_files_map** (:obj:`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
-          high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the
-          low-level being the :obj:`short-cut-names` of the pretrained models with, as associated values, the
-          :obj:`url` to the associated pretrained vocabulary file.
-        - **max_model_input_sizes** (:obj:`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the
-          :obj:`short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence
-          inputs of this model, or :obj:`None` if the model has no maximum input size.
-        - **pretrained_init_configuration** (:obj:`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
-          :obj:`short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments
-          to pass to the ``__init__`` method of the tokenizer class for this pretrained model when loading the
-          tokenizer with the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`
+        - **pretrained_vocab_files_map** (`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the
+          high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the
+          low-level being the `short-cut-names` of the pretrained models with, as associated values, the
+          `url` to the associated pretrained vocabulary file.
+        - **max_model_input_sizes** (`Dict[str, Optional[int]]`) -- A dictionary with, as keys, the
+          `short-cut-names` of the pretrained models, and as associated values, the maximum length of the sequence
+          inputs of this model, or `None` if the model has no maximum input size.
+        - **pretrained_init_configuration** (`Dict[str, Dict[str, Any]]`) -- A dictionary with, as keys, the
+          `short-cut-names` of the pretrained models, and as associated values, a dictionary of specific arguments
+          to pass to the `__init__` method of the tokenizer class for this pretrained model when loading the
+          tokenizer with the [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`]
           method.
-        - **model_input_names** (:obj:`List[str]`) -- A list of inputs expected in the forward pass of the model.
-        - **padding_side** (:obj:`str`) -- The default value for the side on which the model should have padding
-          applied. Should be :obj:`'right'` or :obj:`'left'`.
+        - **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model.
+        - **padding_side** (`str`) -- The default value for the side on which the model should have padding
+          applied. Should be `'right'` or `'left'`.
 
     Args:
-        model_max_length (:obj:`int`, `optional`):
+        model_max_length (`int`, *optional*):
             The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is
-            loaded with :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`, this
-            will be set to the value stored for the associated model in ``max_model_input_sizes`` (see above). If no
-            value is provided, will default to VERY_LARGE_INTEGER (:obj:`int(1e30)`).
-        padding_side: (:obj:`str`, `optional`):
+            loaded with [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`], this
+            will be set to the value stored for the associated model in `max_model_input_sizes` (see above). If no
+            value is provided, will default to VERY_LARGE_INTEGER (`int(1e30)`).
+        padding_side (`str`, *optional*):
             The side on which the model should have padding applied. Should be selected between ['right', 'left'].
             Default value is picked from the class attribute of the same name.
-        model_input_names (:obj:`List[string]`, `optional`):
-            The list of inputs accepted by the forward pass of the model (like :obj:`"token_type_ids"` or
-            :obj:`"attention_mask"`). Default value is picked from the class attribute of the same name.
-        bos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
-            A special token representing the beginning of a sentence. Will be associated to ``self.bos_token`` and
-            ``self.bos_token_id``.
-        eos_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
-            A special token representing the end of a sentence. Will be associated to ``self.eos_token`` and
-            ``self.eos_token_id``.
-        unk_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
-            A special token representing an out-of-vocabulary token. Will be associated to ``self.unk_token`` and
-            ``self.unk_token_id``.
-        sep_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+        model_input_names (`List[str]`, *optional*):
+            The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
+            `"attention_mask"`). Default value is picked from the class attribute of the same name.
+        bos_token (`str` or `tokenizers.AddedToken`, *optional*):
+            A special token representing the beginning of a sentence. Will be associated to `self.bos_token` and
+            `self.bos_token_id`.
+        eos_token (`str` or `tokenizers.AddedToken`, *optional*):
+            A special token representing the end of a sentence. Will be associated to `self.eos_token` and
+            `self.eos_token_id`.
+        unk_token (`str` or `tokenizers.AddedToken`, *optional*):
+            A special token representing an out-of-vocabulary token. Will be associated to `self.unk_token` and
+            `self.unk_token_id`.
+        sep_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token separating two different sentences in the same input (used by BERT for instance). Will be
-            associated to ``self.sep_token`` and ``self.sep_token_id``.
-        pad_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+            associated to `self.sep_token` and `self.sep_token_id`.
+        pad_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
-            attention mechanisms or loss computation. Will be associated to ``self.pad_token`` and
-            ``self.pad_token_id``.
-        cls_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+            attention mechanisms or loss computation. Will be associated to `self.pad_token` and
+            `self.pad_token_id`.
+        cls_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token representing the class of the input (used by BERT for instance). Will be associated to
-            ``self.cls_token`` and ``self.cls_token_id``.
-        mask_token (:obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+            `self.cls_token` and `self.cls_token_id`.
+        mask_token (`str` or `tokenizers.AddedToken`, *optional*):
             A special token representing a masked token (used by masked-language modeling pretraining objectives, like
-            BERT). Will be associated to ``self.mask_token`` and ``self.mask_token_id``.
-        additional_special_tokens (tuple or list of :obj:`str` or :obj:`tokenizers.AddedToken`, `optional`):
+            BERT). Will be associated to `self.mask_token` and `self.mask_token_id`.
+        additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
             A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the
-            tokenization process. Will be associated to ``self.additional_special_tokens`` and
-            ``self.additional_special_tokens_ids``.
+            tokenization process. Will be associated to `self.additional_special_tokens` and
+            `self.additional_special_tokens_ids`.
 """
 
 
 @add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
 class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
     """
-    Base class for :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`.
+    Base class for [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`].
 
     Handles shared (mostly boiler plate) methods for those two classes.
     """
@@ -1476,14 +1478,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
     @property
     def max_len_single_sentence(self) -> int:
         """
-        :obj:`int`: The maximum length of a sentence that can be fed to the model.
+        `int`: The maximum length of a sentence that can be fed to the model.
         """
         return self.model_max_length - self.num_special_tokens_to_add(pair=False)
 
     @property
     def max_len_sentences_pair(self) -> int:
         """
-        :obj:`int`: The maximum combined length of a pair of sentences that can be fed to the model.
+        `int`: The maximum combined length of a pair of sentences that can be fed to the model.
         """
         return self.model_max_length - self.num_special_tokens_to_add(pair=True)
 
@@ -1526,90 +1528,91 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         """
         Returns the vocabulary as a dictionary of token to index.
 
-        :obj:`tokenizer.get_vocab()[token]` is equivalent to :obj:`tokenizer.convert_tokens_to_ids(token)` when
-        :obj:`token` is in the vocab.
+        `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when
+        `token` is in the vocab.
 
         Returns:
-            :obj:`Dict[str, int]`: The vocabulary.
+            `Dict[str, int]`: The vocabulary.
         """
         raise NotImplementedError()
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs):
         r"""
-        Instantiate a :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` (or a derived class) from
+        Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived class) from
         a predefined tokenizer.
 
         Args:
-            pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 Can be either:
 
-                - A string, the `model id` of a predefined tokenizer hosted inside a model repo on huggingface.co.
-                  Valid model ids can be located at the root-level, like ``bert-base-uncased``, or namespaced under a
-                  user or organization name, like ``dbmdz/bert-base-german-cased``.
-                - A path to a `directory` containing vocabulary files required by the tokenizer, for instance saved
-                  using the :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`
-                  method, e.g., ``./my_model_directory/``.
+                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
+                  Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
+                  user or organization name, like `dbmdz/bert-base-german-cased`.
+                - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved
+                  using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`]
+                  method, e.g., `./my_model_directory/`.
                 - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary
                   file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g.,
-                  ``./my_model_directory/vocab.txt``.
-            cache_dir (:obj:`str` or :obj:`os.PathLike`, `optional`):
+                  `./my_model_directory/vocab.txt`.
+            cache_dir (`str` or `os.PathLike`, *optional*):
                 Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the
                 standard cache should not be used.
-            force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            force_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to force the (re-)download the vocabulary files and override the cached versions if they
                 exist.
-            resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            resume_download (`bool`, *optional*, defaults to `False`):
                 Whether or not to delete incompletely received files. Attempt to resume the download if such a file
                 exists.
-            proxies (:obj:`Dict[str, str]`, `optional`):
-                A dictionary of proxy servers to use by protocol or endpoint, e.g., :obj:`{'http': 'foo.bar:3128',
-                'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
-            use_auth_token (:obj:`str` or `bool`, `optional`):
-                The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
-                generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
-            local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            proxies (`Dict[str, str]`, *optional*):
+                A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
+            use_auth_token (`str` or `bool`, *optional*):
+                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+                generated when running `transformers-cli login` (stored in `~/.huggingface`).
+            local_files_only (`bool`, *optional*, defaults to `False`):
                 Whether or not to only rely on local files and not to attempt to download any files.
-            revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+            revision (`str`, *optional*, defaults to `"main"`):
                 The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-                git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                 identifier allowed by git.
-            subfolder (:obj:`str`, `optional`):
+            subfolder (`str`, *optional*):
                 In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for
                 facebook/rag-token-base), specify it here.
-            inputs (additional positional arguments, `optional`):
-                Will be passed along to the Tokenizer ``__init__`` method.
-            kwargs (additional keyword arguments, `optional`):
-                Will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like
-                ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``,
-                ``mask_token``, ``additional_special_tokens``. See parameters in the ``__init__`` for more details.
+            inputs (additional positional arguments, *optional*):
+                Will be passed along to the Tokenizer `__init__` method.
+            kwargs (additional keyword arguments, *optional*):
+                Will be passed to the Tokenizer `__init__` method. Can be used to set special tokens like
+                `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`,
+                `mask_token`, `additional_special_tokens`. See parameters in the `__init__` for more details.
 
-        .. note::
+        <Tip>
 
-            Passing :obj:`use_auth_token=True` is required when you want to use a private model.
+        Passing `use_auth_token=True` is required when you want to use a private model.
 
-        Examples::
+        </Tip>
 
-            # We can't instantiate directly the base class `PreTrainedTokenizerBase` so let's show our examples on a derived class: BertTokenizer
-            # Download vocabulary from huggingface.co and cache.
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        Examples:
 
-            # Download vocabulary from huggingface.co (user-uploaded) and cache.
-            tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
+        ```python
+        # We can't instantiate directly the base class `PreTrainedTokenizerBase` so let's show our examples on a derived class: BertTokenizer
+        # Download vocabulary from huggingface.co and cache.
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 
-            # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
-            tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
+        # Download vocabulary from huggingface.co (user-uploaded) and cache.
+        tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
 
-            # If the tokenizer uses a single vocabulary file, you can point directly to this file
-            tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')
+        # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
+        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
 
-            # You can link tokens to special vocabulary when instantiating
-            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
-            # You should be sure '<unk>' is in the vocabulary when doing that.
-            # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead)
-            assert tokenizer.unk_token == '<unk>'
+        # If the tokenizer uses a single vocabulary file, you can point directly to this file
+        tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')
 
-        """
+        # You can link tokens to special vocabulary when instantiating
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
+        # You should be sure '<unk>' is in the vocabulary when doing that.
+        # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead)
+        assert tokenizer.unk_token == '<unk>'
+        ```"""
         cache_dir = kwargs.pop("cache_dir", None)
         force_download = kwargs.pop("force_download", False)
         resume_download = kwargs.pop("resume_download", False)
@@ -1956,39 +1959,41 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
 
 
         This method make sure the full tokenizer can then be re-loaded using the
-        :meth:`~transformers.tokenization_utils_base.PreTrainedTokenizer.from_pretrained` class method..
+        [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`] class method.
 
-        .. Warning::
-           This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
-           modifying :obj:`tokenizer.do_lower_case` after creation).
+        **Warning:**
+        This won't save modifications you may have applied to the tokenizer after the instantiation (for instance,
+        modifying `tokenizer.do_lower_case` after creation).
 
         Args:
-            save_directory (:obj:`str` or :obj:`os.PathLike`): The path to a directory where the tokenizer will be saved.
-            legacy_format (:obj:`bool`, `optional`):
+            save_directory (`str` or `os.PathLike`): The path to a directory where the tokenizer will be saved.
+            legacy_format (`bool`, *optional*):
                 Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON
                 format as well as in legacy format if it exists, i.e. with tokenizer specific vocabulary and a separate
                 added_tokens files.
 
-                If :obj:`False`, will only save the tokenizer in the unified JSON format. This format is incompatible
-                with "slow" tokenizers (not powered by the `tokenizers` library), so the tokenizer will not be able to
+                If `False`, will only save the tokenizer in the unified JSON format. This format is incompatible
+                with "slow" tokenizers (not powered by the *tokenizers* library), so the tokenizer will not be able to
                 be loaded in the corresponding "slow" tokenizer.
 
-                If :obj:`True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exits, a
+                If `True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exist, a
                 value error is raised.
-            filename_prefix: (:obj:`str`, `optional`):
+            filename_prefix (`str`, *optional*):
                 A prefix to add to the names of the files saved by the tokenizer.
-            push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            push_to_hub (`bool`, *optional*, defaults to `False`):
                 Whether or not to push your model to the Hugging Face model hub after saving it.
 
-                .. warning::
+                <Tip warning={true}>
 
-                    Using :obj:`push_to_hub=True` will synchronize the repository you are pushing to with
-                    :obj:`save_directory`, which requires :obj:`save_directory` to be a local clone of the repo you are
-                    pushing to if it's an existing folder. Pass along :obj:`temp_dir=True` to use a temporary directory
-                    instead.
+                Using `push_to_hub=True` will synchronize the repository you are pushing to with
+                `save_directory`, which requires `save_directory` to be a local clone of the repo you are
+                pushing to if it's an existing folder. Pass along `temp_dir=True` to use a temporary directory
+                instead.
+
+                </Tip>
 
         Returns:
-            A tuple of :obj:`str`: The files saved.
+            A tuple of `str`: The files saved.
         """
         if os.path.isfile(save_directory):
             logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
@@ -2074,7 +2079,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
 
         Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
-        specific :meth:`~transformers.tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`
+        specific [`~tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`]
         """
         if legacy_format is False:
             raise ValueError(
@@ -2102,36 +2107,36 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         Save only the vocabulary of the tokenizer (vocabulary + added tokens).
 
         This method won't save the configuration and special token mappings of the tokenizer. Use
-        :meth:`~transformers.PreTrainedTokenizerFast._save_pretrained` to save the whole state of the tokenizer.
+        [`~PreTrainedTokenizerFast._save_pretrained`] to save the whole state of the tokenizer.
 
         Args:
-            save_directory (:obj:`str`):
+            save_directory (`str`):
                 The directory in which to save the vocabulary.
-            filename_prefix (:obj:`str`, `optional`):
+            filename_prefix (`str`, *optional*):
                 An optional prefix to add to the named of the saved files.
 
         Returns:
-            :obj:`Tuple(str)`: Paths to the files saved.
+            `Tuple(str)`: Paths to the files saved.
         """
         raise NotImplementedError
 
     def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]:
         """
-        Converts a string in a sequence of tokens, replacing unknown tokens with the :obj:`unk_token`.
+        Converts a string in a sequence of tokens, replacing unknown tokens with the `unk_token`.
 
         Args:
-            text (:obj:`str`):
+            text (`str`):
                 The sequence to be encoded.
-            pair (:obj:`str`, `optional`):
+            pair (`str`, *optional*):
                 A second sequence to be encoded with the first.
-            add_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            add_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to add the special tokens associated with the corresponding model.
-            kwargs (additional keyword arguments, `optional`):
+            kwargs (additional keyword arguments, *optional*):
                 Will be passed to the underlying model specific encode method. See details in
-                :meth:`~transformers.PreTrainedTokenizerBase.__call__`
+                [`~PreTrainedTokenizerBase.__call__`]
 
         Returns:
-            :obj:`List[str]`: The list of tokens.
+            `List[str]`: The list of tokens.
         """
         raise NotImplementedError
 
@@ -2142,7 +2147,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         """,
         """
         Returns:
-            :obj:`List[int]`, :obj:`torch.Tensor`, :obj:`tf.Tensor` or :obj:`np.ndarray`: The tokenized ids of the
+            `List[int]`, `torch.Tensor`, `tf.Tensor` or `np.ndarray`: The tokenized ids of the
             text.
         """,
     )
@@ -2161,17 +2166,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         """
         Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary.
 
-        Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
+        Same as doing `self.convert_tokens_to_ids(self.tokenize(text))`.
 
         Args:
-            text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
+            text (`str`, `List[str]` or `List[int]`):
                 The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
-                ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
+                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                 method).
-            text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
+            text_pair (`str`, `List[str]` or `List[int]`, *optional*):
                 Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
-                the ``tokenize`` method) or a list of integers (tokenized string ids using the
-                ``convert_tokens_to_ids`` method).
+                the `tokenize` method) or a list of integers (tokenized string ids using the
+                `convert_tokens_to_ids` method).
         """
         encoded_inputs = self.encode_plus(
             text,
@@ -2353,14 +2358,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         sequences.
 
         Args:
-            text (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+            text (`str`, `List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            text_pair (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_pair (`str`, `List[str]`, `List[List[str]]`):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                :obj:`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
         """
         # Input type checking for clearer error
         def _is_valid_text_input(t):
@@ -2476,18 +2481,21 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         """
         Tokenize and prepare for the model a sequence or a pair of sequences.
 
-        .. warning::
-            This method is deprecated, ``__call__`` should be used instead.
+        <Tip warning={true}>
+
+        This method is deprecated, `__call__` should be used instead.
+
+        </Tip>
 
         Args:
-            text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]` (the latter only for not-fast tokenizers)):
+            text (`str`, `List[str]` or `List[int]` (the latter only for not-fast tokenizers)):
                 The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the
-                ``tokenize`` method) or a list of integers (tokenized string ids using the ``convert_tokens_to_ids``
+                `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids`
                 method).
-            text_pair (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`, `optional`):
+            text_pair (`str`, `List[str]` or `List[int]`, *optional*):
                 Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using
-                the ``tokenize`` method) or a list of integers (tokenized string ids using the
-                ``convert_tokens_to_ids`` method).
+                the `tokenize` method) or a list of integers (tokenized string ids using the
+                `convert_tokens_to_ids` method).
         """
 
         # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
@@ -2575,14 +2583,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         """
         Tokenize and prepare for the model a list of sequences or a list of pairs of sequences.
 
-        .. warning::
-            This method is deprecated, ``__call__`` should be used instead.
+        <Tip warning={true}>
+
+        This method is deprecated, `__call__` should be used instead.
+
+        </Tip>
 
         Args:
-            batch_text_or_text_pairs (:obj:`List[str]`, :obj:`List[Tuple[str, str]]`, :obj:`List[List[str]]`, :obj:`List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also :obj:`List[List[int]]`, :obj:`List[Tuple[List[int], List[int]]]`):
+            batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`):
                 Batch of sequences or pair of sequences to be encoded. This can be a list of
                 string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see
-                details in ``encode_plus``).
+                details in `encode_plus`).
         """
 
         # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
@@ -2664,53 +2675,54 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length
         in the batch.
 
-        Padding side (left/right) padding token ids are defined at the tokenizer level (with ``self.padding_side``,
-        ``self.pad_token_id`` and ``self.pad_token_type_id``)
+        Padding side (left/right) and padding token ids are defined at the tokenizer level (with `self.padding_side`,
+        `self.pad_token_id` and `self.pad_token_type_id`).
 
-        .. note::
+        <Tip>
 
-            If the ``encoded_inputs`` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
-            result will use the same type unless you provide a different tensor type with ``return_tensors``. In the
-            case of PyTorch tensors, you will lose the specific device of your tensors however.
+        If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the
+        result will use the same type unless you provide a different tensor type with `return_tensors`. In the
+        case of PyTorch tensors, you will lose the specific device of your tensors however.
+
+        </Tip>
 
         Args:
-            encoded_inputs (:class:`~transformers.BatchEncoding`, list of :class:`~transformers.BatchEncoding`, :obj:`Dict[str, List[int]]`, :obj:`Dict[str, List[List[int]]` or :obj:`List[Dict[str, List[int]]]`):
-                Tokenized inputs. Can represent one input (:class:`~transformers.BatchEncoding` or :obj:`Dict[str,
-                List[int]]`) or a batch of tokenized inputs (list of :class:`~transformers.BatchEncoding`, `Dict[str,
-                List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
+            encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`):
+                Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of tokenized inputs (list of [`BatchEncoding`],
+                `Dict[str, List[List[int]]]` or `List[Dict[str, List[int]]]`) so you can use this method during preprocessing as
                 well as in a PyTorch Dataloader collate function.
 
-                Instead of :obj:`List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
+                Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors),
                 see the note above for the return type.
-            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`True`):
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `True`):
                  Select a strategy to pad the returned sequences (according to the model's padding side and padding
                  index) among:
 
-                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                   single sequence if provided).
-                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                   maximum acceptable input length for the model if that argument is not provided.
-                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                   different lengths).
-            max_length (:obj:`int`, `optional`):
+            max_length (`int`, *optional*):
                 Maximum length of the returned list and optionally padding length (see above).
-            pad_to_multiple_of (:obj:`int`, `optional`):
+            pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value.
 
                 This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                 >= 7.5 (Volta).
-            return_attention_mask (:obj:`bool`, `optional`):
+            return_attention_mask (`bool`, *optional*):
                 Whether to return the attention mask. If left to the default, will return the attention mask according
-                to the specific tokenizer's default, defined by the :obj:`return_outputs` attribute.
+                to the specific tokenizer's default, defined by the `return_outputs` attribute.
 
-                `What are attention masks? <../glossary.html#attention-mask>`__
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+                [What are attention masks?](../glossary#attention-mask)
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
-            verbose (:obj:`bool`, `optional`, defaults to :obj:`True`):
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            verbose (`bool`, *optional*, defaults to `True`):
                 Whether or not to print more information and warnings.
         """
         # If we have a list of dicts, let's convert it in a dict of lists
@@ -2807,17 +2819,16 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
         """
-        Create the token type IDs corresponding to the sequences passed. `What are token type IDs?
-        <../glossary.html#token-type-ids>`__
+        Create the token type IDs corresponding to the sequences passed. [What are token type IDs?](../glossary#token-type-ids)
 
         Should be overridden in a subclass if the model has a special way of building those.
 
         Args:
-            token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
-            token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.
+            token_ids_0 (`List[int]`): The first tokenized sequence.
+            token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
 
         Returns:
-            :obj:`List[int]`: The token type ids.
+            `List[int]`: The token type ids.
         """
         if token_ids_1 is None:
             return len(token_ids_0) * [0]
@@ -2833,11 +2844,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         This implementation does not add special tokens and this method should be overridden in a subclass.
 
         Args:
-            token_ids_0 (:obj:`List[int]`): The first tokenized sequence.
-            token_ids_1 (:obj:`List[int]`, `optional`): The second tokenized sequence.
+            token_ids_0 (`List[int]`): The first tokenized sequence.
+            token_ids_1 (`List[int]`, *optional*): The second tokenized sequence.
 
         Returns:
-            :obj:`List[int]`: The model input with special tokens.
+            `List[int]`: The model input with special tokens.
         """
         if token_ids_1 is None:
             return token_ids_0
@@ -2868,17 +2879,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         """
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
         adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
-        manages a moving window (with user defined stride) for overflowing tokens. Please Note, for `pair_ids`
-        different than `None` and `truncation_strategy = longest_first` or `True`, it is not possible to return
+        manages a moving window (with user defined stride) for overflowing tokens. Please note, for `pair_ids`
+        different than `None` and `truncation_strategy = longest_first` or `True`, it is not possible to return
         overflowing tokens. Such a combination of arguments will raise an error.
 
         Args:
-            ids (:obj:`List[int]`):
-                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
-                and ``convert_tokens_to_ids`` methods.
-            pair_ids (:obj:`List[int]`, `optional`):
-                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
-                and ``convert_tokens_to_ids`` methods.
+            ids (`List[int]`):
+                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize`
+                and `convert_tokens_to_ids` methods.
+            pair_ids (`List[int]`, *optional*):
+                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
+                and `convert_tokens_to_ids` methods.
         """
 
         # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
@@ -2991,36 +3002,36 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         Truncates a sequence pair in-place following the strategy.
 
         Args:
-            ids (:obj:`List[int]`):
-                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the ``tokenize``
-                and ``convert_tokens_to_ids`` methods.
-            pair_ids (:obj:`List[int]`, `optional`):
-                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the ``tokenize``
-                and ``convert_tokens_to_ids`` methods.
-            num_tokens_to_remove (:obj:`int`, `optional`, defaults to 0):
+            ids (`List[int]`):
+                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize`
+                and `convert_tokens_to_ids` methods.
+            pair_ids (`List[int]`, *optional*):
+                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
+                and `convert_tokens_to_ids` methods.
+            num_tokens_to_remove (`int`, *optional*, defaults to 0):
                 Number of tokens to remove using the truncation strategy.
-            truncation_strategy (:obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`False`):
+            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
                 The strategy to follow for truncation. Can be:
 
-                * :obj:`'longest_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
                   to the maximum acceptable input length for the model if that argument is not provided. This will
                   truncate token by token, removing a token from the longest sequence in the pair if a pair of
                   sequences (or a batch of pairs) is provided.
-                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to
                   the maximum acceptable input length for the model if that argument is not provided. This will only
                   truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or
                   to the maximum acceptable input length for the model if that argument is not provided. This will only
                   truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
                   greater than the model maximum admissible input size).
-            stride (:obj:`int`, `optional`, defaults to 0):
+            stride (`int`, *optional*, defaults to 0):
                 If set to a positive number, the overflowing tokens returned will contain some tokens from the main
                 sequence returned. The value of this argument defines the number of additional tokens.
 
         Returns:
-            :obj:`Tuple[List[int], List[int], List[int]]`: The truncated ``ids``, the truncated ``pair_ids`` and the
-            list of overflowing tokens. Note: The `longest_first` strategy returns empty list of overflowing tokens if
+            `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the
+            list of overflowing tokens. Note: The `longest_first` strategy returns empty list of overflowing tokens if
             a pair of sequences (or a batch of pairs) is provided.
         """
         if num_tokens_to_remove <= 0:
@@ -3153,14 +3164,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
 
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
         """
-        Converts a sequence of tokens in a single string. The most simple way to do it is ``" ".join(tokens)`` but we
+        Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we
         often want to remove sub-word tokenization artifacts at the same time.
 
         Args:
-            tokens (:obj:`List[str]`): The token to join in a string.
+            tokens (`List[str]`): The token to join in a string.
 
         Returns:
-            :obj:`str`: The joined tokens.
+            `str`: The joined tokens.
         """
         raise NotImplementedError
 
@@ -3175,17 +3186,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         Convert a list of lists of token ids into a list of strings by calling decode.
 
         Args:
-            sequences (:obj:`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
-                List of tokenized input ids. Can be obtained using the ``__call__`` method.
-            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
+                List of tokenized input ids. Can be obtained using the `__call__` method.
+            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
                 Whether or not to clean up the tokenization spaces.
-            kwargs (additional keyword arguments, `optional`):
+            kwargs (additional keyword arguments, *optional*):
                 Will be passed to the underlying model specific decode method.
 
         Returns:
-            :obj:`List[str]`: The list of decoded sentences.
+            `List[str]`: The list of decoded sentences.
         """
         return [
             self.decode(
@@ -3208,20 +3219,20 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
         tokens and clean up tokenization spaces.
 
-        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
+        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
 
         Args:
-            token_ids (:obj:`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
-                List of tokenized input ids. Can be obtained using the ``__call__`` method.
-            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
+                List of tokenized input ids. Can be obtained using the `__call__` method.
+            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
-            clean_up_tokenization_spaces (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
                 Whether or not to clean up the tokenization spaces.
-            kwargs (additional keyword arguments, `optional`):
+            kwargs (additional keyword arguments, *optional*):
                 Will be passed to the underlying model specific decode method.
 
         Returns:
-            :obj:`str`: The decoded sentence.
+            `str`: The decoded sentence.
         """
         # Convert inputs to python lists
         token_ids = to_py_obj(token_ids)
@@ -3247,14 +3258,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
     ) -> List[int]:
         """
         Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+        special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of ids of the first sequence.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 List of ids of the second sequence.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
@@ -3279,10 +3290,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms.
 
         Args:
-            out_string (:obj:`str`): The text to clean up.
+            out_string (`str`): The text to clean up.
 
         Returns:
-            :obj:`str`: The cleaned-up string.
+            `str`: The cleaned-up string.
         """
         out_string = (
             out_string.replace(" .", ".")
@@ -3304,9 +3315,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         corresponding model
 
         Args:
-            ids (:obj:`List[str]`): The ids produced by the tokenization
-            max_length (:obj:`int`, `optional`): The max_length desired (does not trigger a warning if it is set)
-            verbose (:obj:`bool`): Whether or not to print more information and warnings.
+            ids (`List[str]`): The ids produced by the tokenization
+            max_length (`int`, *optional*): The max_length desired (does not trigger a warning if it is set)
+            verbose (`bool`): Whether or not to print more information and warnings.
 
         """
         if max_length is None and len(ids) > self.model_max_length and verbose:
@@ -3341,59 +3352,59 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         Prepare model inputs for translation. For best performance, translate one sentence at a time.
 
         Arguments:
-            src_texts (:obj:`List[str]`):
+            src_texts (`List[str]`):
                 List of documents to summarize or source language texts.
-            tgt_texts (:obj:`list`, `optional`):
+            tgt_texts (`list`, *optional*):
                 List of summaries or target language texts.
-            max_length (:obj:`int`, `optional`):
+            max_length (`int`, *optional*):
                 Controls the maximum length for encoder inputs (documents to summarize or source language texts) If
-                left unset or set to :obj:`None`, this will use the predefined model maximum length if a maximum length
+                left unset or set to `None`, this will use the predefined model maximum length if a maximum length
                 is required by one of the truncation/padding parameters. If the model has no specific maximum input
                 length (like XLNet) truncation/padding to a maximum length will be deactivated.
-            max_target_length (:obj:`int`, `optional`):
+            max_target_length (`int`, *optional*):
                 Controls the maximum length of decoder inputs (target language texts or summaries) If left unset or set
-                to :obj:`None`, this will use the max_length value.
-            padding (:obj:`bool`, :obj:`str` or :class:`~transformers.file_utils.PaddingStrategy`, `optional`, defaults to :obj:`False`):
+                to `None`, this will use the max_length value.
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
                 Activates and controls padding. Accepts the following values:
 
-                * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a
                   single sequence if provided).
-                * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the
                   maximum acceptable input length for the model if that argument is not provided.
-                * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
                   different lengths).
-            return_tensors (:obj:`str` or :class:`~transformers.file_utils.TensorType`, `optional`):
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
                 If set, will return tensors instead of list of python integers. Acceptable values are:
 
-                * :obj:`'tf'`: Return TensorFlow :obj:`tf.constant` objects.
-                * :obj:`'pt'`: Return PyTorch :obj:`torch.Tensor` objects.
-                * :obj:`'np'`: Return Numpy :obj:`np.ndarray` objects.
-            truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.TruncationStrategy`, `optional`, defaults to :obj:`True`):
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `True`):
                 Activates and controls truncation. Accepts the following values:
 
-                * :obj:`True` or :obj:`'longest_first'`: Truncate to a maximum length specified with the argument
-                  :obj:`max_length` or to the maximum acceptable input length for the model if that argument is not
+                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument
+                  `max_length` or to the maximum acceptable input length for the model if that argument is not
                   provided. This will truncate token by token, removing a token from the longest sequence in the pair
                   if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`'only_first'`: Truncate to a maximum length specified with the argument :obj:`max_length` or to
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to
                   the maximum acceptable input length for the model if that argument is not provided. This will only
                   truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`'only_second'`: Truncate to a maximum length specified with the argument :obj:`max_length` or
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or
                   to the maximum acceptable input length for the model if that argument is not provided. This will only
                   truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
-                * :obj:`False` or :obj:`'do_not_truncate'` (default): No truncation (i.e., can output batch with
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with
                   sequence lengths greater than the model maximum admissible input size).
             **kwargs:
-                Additional keyword arguments passed along to :obj:`self.__call__`.
+                Additional keyword arguments passed along to `self.__call__`.
 
         Return:
-            :class:`~transformers.BatchEncoding`: A :class:`~transformers.BatchEncoding` with the following fields:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
 
             - **input_ids** -- List of token ids to be fed to the encoder.
             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model.
             - **labels** -- List of token ids for tgt_texts.
 
-            The full set of keys ``[input_ids, attention_mask, labels]``, will only be returned if tgt_texts is passed.
+            The full set of keys `[input_ids, attention_mask, labels]`, will only be returned if tgt_texts is passed.
             Otherwise, input_ids, attention_mask will be the only keys.
         """
         # docstyle-ignore
@@ -3456,20 +3467,20 @@ def get_fast_tokenizer_file(
     Get the tokenizer file to use for this version of transformers.
 
     Args:
-        path_or_repo (:obj:`str` or :obj:`os.PathLike`):
-            Can be either the id of a repo on huggingface.co or a path to a `directory`.
-        revision(:obj:`str`, `optional`, defaults to :obj:`"main"`):
+        path_or_repo (`str` or `os.PathLike`):
+            Can be either the id of a repo on huggingface.co or a path to a *directory*.
+        revision (`str`, *optional*, defaults to `"main"`):
             The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
-            git-based system for storing models and other artifacts on huggingface.co, so ``revision`` can be any
+            git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
             identifier allowed by git.
-        use_auth_token (:obj:`str` or `bool`, `optional`):
-            The token to use as HTTP bearer authorization for remote files. If :obj:`True`, will use the token
-            generated when running :obj:`transformers-cli login` (stored in :obj:`~/.huggingface`).
-        local_files_only (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        use_auth_token (`str` or `bool`, *optional*):
+            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token
+            generated when running `transformers-cli login` (stored in `~/.huggingface`).
+        local_files_only (`bool`, *optional*, defaults to `False`):
             Whether or not to only rely on local files and not to attempt to download any files.
 
     Returns:
-        :obj:`str`: The tokenizer file to use.
+        `str`: The tokenizer file to use.
     """
     # Inspect all files from the repo/folder.
     all_files = get_list_of_files(
diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py
index cf6507dc21..2deb98d869 100644
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -56,11 +56,11 @@ TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
 ADDED_TOKENS_FILE = "added_tokens.json"
 
 INIT_TOKENIZER_DOCSTRING += """
-        tokenizer_object (:class:`tokenizers.Tokenizer`):
-            A :class:`tokenizers.Tokenizer` object from 🤗 tokenizers to instantiate from. See :doc:`Using tokenizers
-            from 🤗 tokenizers <../fast_tokenizers>` for more information.
-        tokenizer_file (:class:`str`):
-            A path to a local JSON file representing a previously serialized :class:`tokenizers.Tokenizer` object from
+        tokenizer_object ([`tokenizers.Tokenizer`]):
+            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers
+            from 🤗 tokenizers](../fast_tokenizers) for more information.
+        tokenizer_file (`str`):
+            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from
             🤗 tokenizers.
 """
 
@@ -77,7 +77,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
     """
     Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).
 
-    Inherits from :class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`.
+    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].
 
     Handles all the shared methods for tokenization and special tokens, as well as methods for
     downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.
@@ -139,7 +139,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
     @property
     def vocab_size(self) -> int:
         """
-        :obj:`int`: Size of the base vocabulary (without the added tokens).
+        `int`: Size of the base vocabulary (without the added tokens).
         """
         return self._tokenizer.get_vocab_size(with_added_tokens=False)
 
@@ -155,7 +155,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         Returns the added tokens in the vocabulary as a dictionary of token to index.
 
         Returns:
-            :obj:`Dict[str, int]`: The added tokens.
+            `Dict[str, int]`: The added tokens.
         """
         base_vocab = self._tokenizer.get_vocab(with_added_tokens=False)
         full_vocab = self._tokenizer.get_vocab(with_added_tokens=True)
@@ -171,14 +171,14 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
     @property
     def backend_tokenizer(self) -> TokenizerFast:
         """
-        :obj:`tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
+        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
         """
         return self._tokenizer
 
     @property
     def decoder(self) -> DecoderFast:
         """
-        :obj:`tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
+        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
         """
         return self._tokenizer._tokenizer.decoder
 
@@ -235,10 +235,10 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         vocabulary.
 
         Args:
-            tokens (:obj:`str` or :obj:`List[str]`): One or several token(s) to convert to token id(s).
+            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).
 
         Returns:
-            :obj:`int` or :obj:`List[int]`: The token id or list of token ids.
+            `int` or `List[int]`: The token id or list of token ids.
         """
         if tokens is None:
             return None
@@ -270,17 +270,20 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         """
         Returns the number of added tokens when encoding a sequence with special tokens.
 
-        .. note::
-            This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not
-            put this inside your training loop.
+        <Tip>
+
+        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not
+        put this inside your training loop.
+
+        </Tip>
 
         Args:
-            pair (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            pair (`bool`, *optional*, defaults to `False`):
                 Whether the number of added tokens should be computed in the case of a sequence pair or a single
                 sequence.
 
         Returns:
-            :obj:`int`: Number of special tokens added to sequences.
+            `int`: Number of special tokens added to sequences.
         """
         return self._tokenizer.num_special_tokens_to_add(pair)
 
@@ -292,13 +295,13 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         added tokens.
 
         Args:
-            ids (:obj:`int` or :obj:`List[int]`):
+            ids (`int` or `List[int]`):
                 The token id (or token ids) to convert to tokens.
-            skip_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not to remove special tokens in the decoding.
 
         Returns:
-            :obj:`str` or :obj:`List[str]`: The decoded token(s).
+            `str` or `List[str]`: The decoded token(s).
         """
         if isinstance(ids, int):
             return self._tokenizer.id_to_token(ids)
@@ -330,15 +333,15 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         section.
 
         Args:
-            padding_strategy (:class:`~transformers.file_utils.PaddingStrategy`):
+            padding_strategy ([`~file_utils.PaddingStrategy`]):
                 The kind of padding that will be applied to the input
-            truncation_strategy (:class:`~transformers.tokenization_utils_base.TruncationStrategy`):
+            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                 The kind of truncation that will be applied to the input
-            max_length (:obj:`int`):
+            max_length (`int`):
                 The maximum size of a sequence.
-            stride (:obj:`int`):
+            stride (`int`):
                 The stride to use when handling overflow.
-            pad_to_multiple_of (:obj:`int`, `optional`):
+            pad_to_multiple_of (`int`, *optional*):
                 If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                 the use of Tensor Cores on NVIDIA hardware with compute capability >= 7.5 (Volta).
         """
@@ -589,22 +592,22 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         as the current one.
 
         Args:
-            text_iterator (generator of :obj:`List[str]`):
+            text_iterator (generator of `List[str]`):
                 The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                 if you have everything in memory.
-            vocab_size (:obj:`int`):
+            vocab_size (`int`):
                 The size of the vocabulary you want for your tokenizer.
-            new_special_tokens (list of :obj:`str` or :obj:`AddedToken`, `optional`):
+            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                 A list of new special tokens to add to the tokenizer you are training.
-            special_tokens_map (:obj:`Dict[str, str]`, `optional`):
+            special_tokens_map (`Dict[str, str]`, *optional*):
                 If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                 token name to new special token name in this argument.
             kwargs:
                 Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.
 
         Returns:
-            :class:`~transformers.PreTrainedTokenizerFast`: A new tokenizer of the same type as the original one,
-            trained on :obj:`text_iterator`.
+            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one,
+            trained on `text_iterator`.
 
         """
         tokenizer_json = json.loads(self._tokenizer.to_str())
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 30b8829983..5e937823e6 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -199,73 +199,76 @@ class Trainer:
     Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers.
 
     Args:
-        model (:class:`~transformers.PreTrainedModel` or :obj:`torch.nn.Module`, `optional`):
-            The model to train, evaluate or use for predictions. If not provided, a ``model_init`` must be passed.
+        model ([`PreTrainedModel`] or `torch.nn.Module`, *optional*):
+            The model to train, evaluate or use for predictions. If not provided, a `model_init` must be passed.
 
-            .. note::
+            <Tip>
 
-                :class:`~transformers.Trainer` is optimized to work with the :class:`~transformers.PreTrainedModel`
-                provided by the library. You can still use your own models defined as :obj:`torch.nn.Module` as long as
-                they work the same way as the 🤗 Transformers models.
-        args (:class:`~transformers.TrainingArguments`, `optional`):
+            [`Trainer`] is optimized to work with the [`PreTrainedModel`]
+            provided by the library. You can still use your own models defined as `torch.nn.Module` as long as
+            they work the same way as the 🤗 Transformers models.
+
+            </Tip>
+
+        args ([`TrainingArguments`], *optional*):
             The arguments to tweak for training. Will default to a basic instance of
-            :class:`~transformers.TrainingArguments` with the ``output_dir`` set to a directory named `tmp_trainer` in
+            [`TrainingArguments`] with the `output_dir` set to a directory named *tmp_trainer* in
             the current directory if not provided.
-        data_collator (:obj:`DataCollator`, `optional`):
-            The function to use to form a batch from a list of elements of :obj:`train_dataset` or :obj:`eval_dataset`.
-            Will default to :func:`~transformers.default_data_collator` if no ``tokenizer`` is provided, an instance of
-            :func:`~transformers.DataCollatorWithPadding` otherwise.
-        train_dataset (:obj:`torch.utils.data.Dataset` or :obj:`torch.utils.data.IterableDataset`, `optional`):
-            The dataset to use for training. If it is an :obj:`datasets.Dataset`, columns not accepted by the
-            ``model.forward()`` method are automatically removed.
+        data_collator (`DataCollator`, *optional*):
+            The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`.
+            Will default to [`default_data_collator`] if no `tokenizer` is provided, an instance of
+            [`DataCollatorWithPadding`] otherwise.
+        train_dataset (`torch.utils.data.Dataset` or `torch.utils.data.IterableDataset`, *optional*):
+            The dataset to use for training. If it is a `datasets.Dataset`, columns not accepted by the
+            `model.forward()` method are automatically removed.
 
-            Note that if it's a :obj:`torch.utils.data.IterableDataset` with some randomization and you are training in
-            a distributed fashion, your iterable dataset should either use a internal attribute :obj:`generator` that
-            is a :obj:`torch.Generator` for the randomization that must be identical on all processes (and the Trainer
-            will manually set the seed of this :obj:`generator` at each epoch) or have a :obj:`set_epoch()` method that
+            Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in
+            a distributed fashion, your iterable dataset should either use an internal attribute `generator` that
+            is a `torch.Generator` for the randomization that must be identical on all processes (and the Trainer
+            will manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that
             internally sets the seed of the RNGs used.
-        eval_dataset (:obj:`torch.utils.data.Dataset`, `optional`):
-             The dataset to use for evaluation. If it is an :obj:`datasets.Dataset`, columns not accepted by the
-             ``model.forward()`` method are automatically removed.
-        tokenizer (:class:`PreTrainedTokenizerBase`, `optional`):
+        eval_dataset (`torch.utils.data.Dataset`, *optional*):
+            The dataset to use for evaluation. If it is a `datasets.Dataset`, columns not accepted by the
+            `model.forward()` method are automatically removed.
+        tokenizer ([`PreTrainedTokenizerBase`], *optional*):
             The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs the
             maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an
             interrupted training or reuse the fine-tuned model.
-        model_init (:obj:`Callable[[], PreTrainedModel]`, `optional`):
+        model_init (`Callable[[], PreTrainedModel]`, *optional*):
             A function that instantiates the model to be used. If provided, each call to
-            :meth:`~transformers.Trainer.train` will start from a new instance of the model as given by this function.
+            [`~Trainer.train`] will start from a new instance of the model as given by this function.
 
             The function may have zero argument, or a single one containing the optuna/Ray Tune/SigOpt trial object, to
             be able to choose different architectures according to hyper parameters (such as layer count, sizes of
             inner layers, dropout probabilities etc).
-        compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`):
+        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
             The function that will be used to compute metrics at evaluation. Must take a
-            :class:`~transformers.EvalPrediction` and return a dictionary string to metric values.
-        callbacks (List of :class:`~transformers.TrainerCallback`, `optional`):
+            [`EvalPrediction`] and return a dictionary mapping strings to metric values.
+        callbacks (List of [`TrainerCallback`], *optional*):
             A list of callbacks to customize the training loop. Will add those to the list of default callbacks
-            detailed in :doc:`here <callback>`.
+            detailed in [here](callback).
 
-            If you want to remove one of the default callbacks used, use the :meth:`Trainer.remove_callback` method.
-        optimizers (:obj:`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, `optional`): A tuple
+            If you want to remove one of the default callbacks used, use the [`Trainer.remove_callback`] method.
+        optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*): A tuple
             containing the optimizer and the scheduler to use. Will default to an instance of
-            :class:`~transformers.AdamW` on your model and a scheduler given by
-            :func:`~transformers.get_linear_schedule_with_warmup` controlled by :obj:`args`.
+            [`AdamW`] on your model and a scheduler given by
+            [`get_linear_schedule_with_warmup`] controlled by `args`.
 
     Important attributes:
 
         - **model** -- Always points to the core model. If using a transformers model, it will be a
-          :class:`~transformers.PreTrainedModel` subclass.
+          [`PreTrainedModel`] subclass.
         - **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the
-          original model. This is the model that should be used for the forward pass. For example, under ``DeepSpeed``,
-          the inner model is wrapped in ``DeepSpeed`` and then again in ``torch.nn.DistributedDataParallel``. If the
-          inner model hasn't been wrapped, then ``self.model_wrapped`` is the same as ``self.model``.
+          original model. This is the model that should be used for the forward pass. For example, under `DeepSpeed`,
+          the inner model is wrapped in `DeepSpeed` and then again in `torch.nn.DistributedDataParallel`. If the
+          inner model hasn't been wrapped, then `self.model_wrapped` is the same as `self.model`.
         - **is_model_parallel** -- Whether or not a model has been switched to a model parallel mode (different from
           data parallelism, this means some of the model layers are split on different GPUs).
         - **place_model_on_device** -- Whether or not to automatically place the model on the device - it will be set
-          to :obj:`False` if model parallel or deepspeed is used, or if the default
-          ``TrainingArguments.place_model_on_device`` is overridden to return :obj:`False` .
-        - **is_in_train** -- Whether or not a model is currently running ``train`` (e.g. when ``evaluate`` is called
-          while in ``train``)
+          to `False` if model parallel or deepspeed is used, or if the default
+          `TrainingArguments.place_model_on_device` is overridden to return `False`.
+        - **is_in_train** -- Whether or not a model is currently running `train` (e.g. when `evaluate` is called
+          while in `train`)
 
     """
 
@@ -490,38 +493,38 @@ class Trainer:
 
     def add_callback(self, callback):
         """
-        Add a callback to the current list of :class:`~transformer.TrainerCallback`.
+        Add a callback to the current list of [`TrainerCallback`].
 
         Args:
-           callback (:obj:`type` or :class:`~transformer.TrainerCallback`):
-               A :class:`~transformer.TrainerCallback` class or an instance of a :class:`~transformer.TrainerCallback`.
+           callback (`type` or [`TrainerCallback`]):
+               A [`TrainerCallback`] class or an instance of a [`TrainerCallback`].
                In the first case, will instantiate a member of that class.
         """
         self.callback_handler.add_callback(callback)
 
     def pop_callback(self, callback):
         """
-        Remove a callback from the current list of :class:`~transformer.TrainerCallback` and returns it.
+        Remove a callback from the current list of [`TrainerCallback`] and returns it.
 
-        If the callback is not found, returns :obj:`None` (and no error is raised).
+        If the callback is not found, returns `None` (and no error is raised).
 
         Args:
-           callback (:obj:`type` or :class:`~transformer.TrainerCallback`):
-               A :class:`~transformer.TrainerCallback` class or an instance of a :class:`~transformer.TrainerCallback`.
+           callback (`type` or [`TrainerCallback`]):
+               A [`TrainerCallback`] class or an instance of a [`TrainerCallback`].
                In the first case, will pop the first member of that class found in the list of callbacks.
 
         Returns:
-            :class:`~transformer.TrainerCallback`: The callback removed, if found.
+            [`TrainerCallback`]: The callback removed, if found.
         """
         return self.callback_handler.pop_callback(callback)
 
     def remove_callback(self, callback):
         """
-        Remove a callback from the current list of :class:`~transformer.TrainerCallback`.
+        Remove a callback from the current list of [`TrainerCallback`].
 
         Args:
-           callback (:obj:`type` or :class:`~transformer.TrainerCallback`):
-               A :class:`~transformer.TrainerCallback` class or an instance of a :class:`~transformer.TrainerCallback`.
+           callback (`type` or [`TrainerCallback`]):
+               A [`TrainerCallback`] class or an instance of a [`TrainerCallback`].
                In the first case, will remove the first member of that class found in the list of callbacks.
         """
         self.callback_handler.remove_callback(callback)
@@ -624,9 +627,9 @@ class Trainer:
 
     def get_train_dataloader(self) -> DataLoader:
         """
-        Returns the training :class:`~torch.utils.data.DataLoader`.
+        Returns the training [`~torch.utils.data.DataLoader`].
 
-        Will use no sampler if :obj:`self.train_dataset` does not implement :obj:`__len__`, a random sampler (adapted
+        Will use no sampler if `self.train_dataset` does not implement `__len__`, a random sampler (adapted
         to distributed training if necessary) otherwise.
 
         Subclass and override this method if you want to inject some custom behavior.
@@ -699,14 +702,14 @@ class Trainer:
 
     def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoader:
         """
-        Returns the evaluation :class:`~torch.utils.data.DataLoader`.
+        Returns the evaluation [`~torch.utils.data.DataLoader`].
 
         Subclass and override this method if you want to inject some custom behavior.
 
         Args:
-            eval_dataset (:obj:`torch.utils.data.Dataset`, `optional`):
-                If provided, will override :obj:`self.eval_dataset`. If it is an :obj:`datasets.Dataset`, columns not
-                accepted by the ``model.forward()`` method are automatically removed. It must implement :obj:`__len__`.
+            eval_dataset (`torch.utils.data.Dataset`, *optional*):
+                If provided, will override `self.eval_dataset`. If it is a `datasets.Dataset`, columns not
+                accepted by the `model.forward()` method are automatically removed. It must implement `__len__`.
         """
         if eval_dataset is None and self.eval_dataset is None:
             raise ValueError("Trainer: evaluation requires an eval_dataset.")
@@ -746,14 +749,14 @@ class Trainer:
 
     def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
         """
-        Returns the test :class:`~torch.utils.data.DataLoader`.
+        Returns the test [`~torch.utils.data.DataLoader`].
 
         Subclass and override this method if you want to inject some custom behavior.
 
         Args:
-            test_dataset (:obj:`torch.utils.data.Dataset`, `optional`):
-                The test dataset to use. If it is an :obj:`datasets.Dataset`, columns not accepted by the
-                ``model.forward()`` method are automatically removed. It must implement :obj:`__len__`.
+            test_dataset (`torch.utils.data.Dataset`):
+                The test dataset to use. If it is a `datasets.Dataset`, columns not accepted by the
+                `model.forward()` method are automatically removed. It must implement `__len__`.
         """
         if is_datasets_available() and isinstance(test_dataset, datasets.Dataset):
             test_dataset = self._remove_unused_columns(test_dataset, description="test")
@@ -792,8 +795,8 @@ class Trainer:
         Setup the optimizer and the learning rate scheduler.
 
         We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
-        Trainer's init through :obj:`optimizers`, or subclass and override this method (or :obj:`create_optimizer`
-        and/or :obj:`create_scheduler`) in a subclass.
+        Trainer's init through `optimizers`, or subclass and override this method (or `create_optimizer`
+        and/or `create_scheduler`) in a subclass.
         """
         self.create_optimizer()
         self.create_scheduler(num_training_steps=num_training_steps, optimizer=self.optimizer)
@@ -803,7 +806,7 @@ class Trainer:
         Setup the optimizer.
 
         We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
-        Trainer's init through :obj:`optimizers`, or subclass and override this method in a subclass.
+        Trainer's init through `optimizers`, or subclass and override this method in a subclass.
         """
         if self.optimizer is None:
             decay_parameters = get_parameter_names(self.model, [nn.LayerNorm])
@@ -862,9 +865,9 @@ class Trainer:
 
     def num_examples(self, dataloader: DataLoader) -> int:
         """
-        Helper to get number of samples in a :class:`~torch.utils.data.DataLoader` by accessing its dataset.
+        Helper to get number of samples in a [`~torch.utils.data.DataLoader`] by accessing its dataset.
 
-        Will raise an exception if the underlying dataset does not implement method :obj:`__len__`
+        Will raise an exception if the underlying dataset does not implement method `__len__`
         """
         return len(dataloader.dataset)
 
@@ -1031,14 +1034,14 @@ class Trainer:
         Main training entry point.
 
         Args:
-            resume_from_checkpoint (:obj:`str` or :obj:`bool`, `optional`):
-                If a :obj:`str`, local path to a saved checkpoint as saved by a previous instance of
-                :class:`~transformers.Trainer`. If a :obj:`bool` and equals `True`, load the last checkpoint in
-                `args.output_dir` as saved by a previous instance of :class:`~transformers.Trainer`. If present,
+            resume_from_checkpoint (`str` or `bool`, *optional*):
+                If a `str`, local path to a saved checkpoint as saved by a previous instance of
+                [`Trainer`]. If a `bool` and equals `True`, load the last checkpoint in
+                `args.output_dir` as saved by a previous instance of [`Trainer`]. If present,
                 training will resume from the model/optimizer/scheduler states loaded here.
-            trial (:obj:`optuna.Trial` or :obj:`Dict[str, Any]`, `optional`):
+            trial (`optuna.Trial` or `Dict[str, Any]`, *optional*):
                 The trial run or the hyperparameter dictionary for hyperparameter search.
-            ignore_keys_for_eval (:obj:`List[str]`, `optional`)
+            ignore_keys_for_eval (`List[str]`, *optional*):
                 A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                 gathering predictions for evaluation during the training.
             kwargs:
@@ -1717,47 +1720,47 @@ class Trainer:
         **kwargs,
     ) -> BestRun:
         """
-        Launch an hyperparameter search using ``optuna`` or ``Ray Tune`` or ``SigOpt``. The optimized quantity is
-        determined by :obj:`compute_objective`, which defaults to a function returning the evaluation loss when no
+        Launch a hyperparameter search using `optuna` or `Ray Tune` or `SigOpt`. The optimized quantity is
+        determined by `compute_objective`, which defaults to a function returning the evaluation loss when no
         metric is provided, the sum of all metrics otherwise.
 
-        .. warning::
+        <Tip warning={true}>
 
-            To use this method, you need to have provided a ``model_init`` when initializing your
-            :class:`~transformers.Trainer`: we need to reinitialize the model at each new run. This is incompatible
-            with the ``optimizers`` argument, so you need to subclass :class:`~transformers.Trainer` and override the
-            method :meth:`~transformers.Trainer.create_optimizer_and_scheduler` for custom optimizer/scheduler.
+        To use this method, you need to have provided a `model_init` when initializing your
+        [`Trainer`]: we need to reinitialize the model at each new run. This is incompatible
+        with the `optimizers` argument, so you need to subclass [`Trainer`] and override the
+        method [`~Trainer.create_optimizer_and_scheduler`] for custom optimizer/scheduler.
+
+        </Tip>
 
         Args:
-            hp_space (:obj:`Callable[["optuna.Trial"], Dict[str, float]]`, `optional`):
+            hp_space (`Callable[["optuna.Trial"], Dict[str, float]]`, *optional*):
                 A function that defines the hyperparameter search space. Will default to
-                :func:`~transformers.trainer_utils.default_hp_space_optuna` or
-                :func:`~transformers.trainer_utils.default_hp_space_ray` or
-                :func:`~transformers.trainer_utils.default_hp_space_sigopt` depending on your backend.
-            compute_objective (:obj:`Callable[[Dict[str, float]], float]`, `optional`):
+                [`~trainer_utils.default_hp_space_optuna`] or
+                [`~trainer_utils.default_hp_space_ray`] or
+                [`~trainer_utils.default_hp_space_sigopt`] depending on your backend.
+            compute_objective (`Callable[[Dict[str, float]], float]`, *optional*):
                 A function computing the objective to minimize or maximize from the metrics returned by the
-                :obj:`evaluate` method. Will default to :func:`~transformers.trainer_utils.default_compute_objective`.
-            n_trials (:obj:`int`, `optional`, defaults to 100):
+                `evaluate` method. Will default to [`~trainer_utils.default_compute_objective`].
+            n_trials (`int`, *optional*, defaults to 100):
                 The number of trial runs to test.
-            direction(:obj:`str`, `optional`, defaults to :obj:`"minimize"`):
-                Whether to optimize greater or lower objects. Can be :obj:`"minimize"` or :obj:`"maximize"`, you should
-                pick :obj:`"minimize"` when optimizing the validation loss, :obj:`"maximize"` when optimizing one or
+            direction (`str`, *optional*, defaults to `"minimize"`):
+                Whether to optimize greater or lower objectives. Can be `"minimize"` or `"maximize"`, you should
+                pick `"minimize"` when optimizing the validation loss, `"maximize"` when optimizing one or
                 several metrics.
-            backend(:obj:`str` or :class:`~transformers.training_utils.HPSearchBackend`, `optional`):
+            backend (`str` or [`~trainer_utils.HPSearchBackend`], *optional*):
                 The backend to use for hyperparameter search. Will default to optuna or Ray Tune or SigOpt, depending
                 on which one is installed. If all are installed, will default to optuna.
             kwargs:
-                Additional keyword arguments passed along to :obj:`optuna.create_study` or :obj:`ray.tune.run`. For
+                Additional keyword arguments passed along to `optuna.create_study` or `ray.tune.run`. For
                 more information see:
 
-                - the documentation of `optuna.create_study
-                  <https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.create_study.html>`__
-                - the documentation of `tune.run
-                  <https://docs.ray.io/en/latest/tune/api_docs/execution.html#tune-run>`__
-                - the documentation of `sigopt <https://app.sigopt.com/docs/endpoints/experiments/create>`__
+                - the documentation of [optuna.create_study](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.create_study.html)
+                - the documentation of [tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html#tune-run)
+                - the documentation of [sigopt](https://app.sigopt.com/docs/endpoints/experiments/create)
 
         Returns:
-            :class:`transformers.trainer_utils.BestRun`: All the information about the best run.
+            [`trainer_utils.BestRun`]: All the information about the best run.
         """
         if backend is None:
             backend = default_hp_search_backend()
@@ -1799,12 +1802,12 @@ class Trainer:
 
     def log(self, logs: Dict[str, float]) -> None:
         """
-        Log :obj:`logs` on the various objects watching training.
+        Log `logs` on the various objects watching training.
 
         Subclass and override this method to inject custom behavior.
 
         Args:
-            logs (:obj:`Dict[str, float]`):
+            logs (`Dict[str, float]`):
                 The values to log.
         """
         if self.state.epoch is not None:
@@ -1816,7 +1819,7 @@ class Trainer:
 
     def _prepare_input(self, data: Union[torch.Tensor, Any]) -> Union[torch.Tensor, Any]:
         """
-        Prepares one :obj:`data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors.
+        Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors.
         """
         if isinstance(data, Mapping):
             return type(data)({k: self._prepare_input(v) for k, v in data.items()})
@@ -1834,7 +1837,7 @@ class Trainer:
 
     def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[str, Union[torch.Tensor, Any]]:
         """
-        Prepare :obj:`inputs` before feeding them to the model, converting them to tensors if they are not already and
+        Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and
         handling potential state.
         """
         inputs = self._prepare_input(inputs)
@@ -1845,7 +1848,7 @@ class Trainer:
 
     def autocast_smart_context_manager(self):
         """
-        A helper wrapper that creates an appropriate context manager for :obj:`autocast` while feeding it the desired
+        A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired
         arguments, depending on the situation.
         """
         if self.use_amp:
@@ -1865,16 +1868,16 @@ class Trainer:
         Subclass and override to inject custom behavior.
 
         Args:
-            model (:obj:`nn.Module`):
+            model (`nn.Module`):
                 The model to train.
-            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
+            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                 The inputs and targets of the model.
 
                 The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
-                argument :obj:`labels`. Check your model's documentation for all accepted arguments.
+                argument `labels`. Check your model's documentation for all accepted arguments.
 
         Return:
-            :obj:`torch.Tensor`: The tensor with training loss on this batch.
+            `torch.Tensor`: The tensor with training loss on this batch.
         """
         model.train()
         inputs = self._prepare_inputs(inputs)
@@ -1941,7 +1944,7 @@ class Trainer:
     def is_world_process_zero(self) -> bool:
         """
         Whether or not this process is the global main process (when training in a distributed fashion on several
-        machines, this is only going to be :obj:`True` for one process).
+        machines, this is only going to be `True` for one process).
         """
         # Special case for SageMaker ModelParallel since there process_index is dp_process_index, not the global
         # process index.
@@ -1952,7 +1955,7 @@ class Trainer:
 
     def save_model(self, output_dir: Optional[str] = None):
         """
-        Will save the model, so you can reload it using :obj:`from_pretrained()`.
+        Will save the model, so you can reload it using `from_pretrained()`.
 
         Will only save from the main process.
         """
@@ -2125,19 +2128,19 @@ class Trainer:
         Run evaluation and returns metrics.
 
         The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
-        (pass it to the init :obj:`compute_metrics` argument).
+        (pass it to the init `compute_metrics` argument).
 
         You can also subclass and override this method to inject custom behavior.
 
         Args:
-            eval_dataset (:obj:`Dataset`, `optional`):
-                Pass a dataset if you wish to override :obj:`self.eval_dataset`. If it is an :obj:`datasets.Dataset`,
-                columns not accepted by the ``model.forward()`` method are automatically removed. It must implement the
-                :obj:`__len__` method.
-            ignore_keys (:obj:`Lst[str]`, `optional`):
+            eval_dataset (`Dataset`, *optional*):
+                Pass a dataset if you wish to override `self.eval_dataset`. If it is a `datasets.Dataset`,
+                columns not accepted by the `model.forward()` method are automatically removed. It must implement the
+                `__len__` method.
+            ignore_keys (`List[str]`, *optional*):
                 A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                 gathering predictions.
-            metric_key_prefix (:obj:`str`, `optional`, defaults to :obj:`"eval"`):
+            metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
                 An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
                 "eval_bleu" if the prefix is "eval" (default)
 
@@ -2191,30 +2194,32 @@ class Trainer:
         Run prediction and returns predictions and potential metrics.
 
         Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
-        will also return metrics, like in :obj:`evaluate()`.
+        will also return metrics, like in `evaluate()`.
 
         Args:
-            test_dataset (:obj:`Dataset`):
-                Dataset to run the predictions on. If it is an :obj:`datasets.Dataset`, columns not accepted by the
-                ``model.forward()`` method are automatically removed. Has to implement the method :obj:`__len__`
-            ignore_keys (:obj:`Lst[str]`, `optional`):
+            test_dataset (`Dataset`):
+                Dataset to run the predictions on. If it is a `datasets.Dataset`, columns not accepted by the
+                `model.forward()` method are automatically removed. Has to implement the method `__len__`
+            ignore_keys (`List[str]`, *optional*):
                 A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                 gathering predictions.
-            metric_key_prefix (:obj:`str`, `optional`, defaults to :obj:`"test"`):
+            metric_key_prefix (`str`, *optional*, defaults to `"test"`):
                 An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
                 "test_bleu" if the prefix is "test" (default)
 
-        .. note::
+        <Tip>
 
-            If your predictions or labels have different sequence length (for instance because you're doing dynamic
-            padding in a token classification task) the predictions will be padded (on the right) to allow for
-            concatenation into one array. The padding index is -100.
+        If your predictions or labels have different sequence length (for instance because you're doing dynamic
+        padding in a token classification task) the predictions will be padded (on the right) to allow for
+        concatenation into one array. The padding index is -100.
 
-        Returns: `NamedTuple` A namedtuple with the following keys:
+        </Tip>
 
-            - predictions (:obj:`np.ndarray`): The predictions on :obj:`test_dataset`.
-            - label_ids (:obj:`np.ndarray`, `optional`): The labels (if the dataset contained some).
-            - metrics (:obj:`Dict[str, float]`, `optional`): The potential dictionary of metrics (if the dataset
+        Returns: *NamedTuple* A namedtuple with the following keys:
+
+            - predictions (`np.ndarray`): The predictions on `test_dataset`.
+            - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
+            - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset
               contained labels).
         """
         # memory metrics - must set up as early as possible
@@ -2250,7 +2255,7 @@ class Trainer:
         metric_key_prefix: str = "eval",
     ) -> EvalLoopOutput:
         """
-        Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`.
+        Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.
 
         Works both with or without labels.
         """
@@ -2468,21 +2473,21 @@ class Trainer:
         ignore_keys: Optional[List[str]] = None,
     ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
         """
-        Perform an evaluation step on :obj:`model` using obj:`inputs`.
+        Perform an evaluation step on `model` using `inputs`.
 
         Subclass and override to inject custom behavior.
 
         Args:
-            model (:obj:`nn.Module`):
+            model (`nn.Module`):
                 The model to evaluate.
-            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
+            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                 The inputs and targets of the model.
 
                 The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
-                argument :obj:`labels`. Check your model's documentation for all accepted arguments.
-            prediction_loss_only (:obj:`bool`):
+                argument `labels`. Check your model's documentation for all accepted arguments.
+            prediction_loss_only (`bool`):
                 Whether or not to return the loss only.
-            ignore_keys (:obj:`Lst[str]`, `optional`):
+            ignore_keys (`List[str]`, *optional*):
                 A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                 gathering predictions.
 
@@ -2559,16 +2564,16 @@ class Trainer:
 
     def floating_point_ops(self, inputs: Dict[str, Union[torch.Tensor, Any]]):
         """
-        For models that inherit from :class:`~transformers.PreTrainedModel`, uses that method to compute the number of
+        For models that inherit from [`PreTrainedModel`], uses that method to compute the number of
         floating point operations for every backward + forward pass. If using another model, either implement such a
         method in the model or subclass and override this method.
 
         Args:
-            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
+            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                 The inputs and targets of the model.
 
         Returns:
-            :obj:`int`: The number of floating-point operations.
+            `int`: The number of floating-point operations.
         """
         if hasattr(self.model, "floating_point_ops"):
             return self.model.floating_point_ops(inputs)
@@ -2577,7 +2582,7 @@ class Trainer:
 
     def init_git_repo(self):
         """
-        Initializes a git repo in :obj:`self.args.hub_model_id`.
+        Initializes a git repo in `self.args.hub_model_id`.
         """
         if not self.is_world_process_zero():
             return
@@ -2694,19 +2699,19 @@ class Trainer:
 
     def push_to_hub(self, commit_message: Optional[str] = "End of training", blocking: bool = True, **kwargs) -> str:
         """
-        Upload `self.model` and `self.tokenizer` to the 🤗 model hub on the repo `self.args.hub_model_id`.
+        Upload `self.model` and `self.tokenizer` to the 🤗 model hub on the repo `self.args.hub_model_id`.
 
         Parameters:
-            commit_message (:obj:`str`, `optional`, defaults to :obj:`"End of training"`):
+            commit_message (`str`, *optional*, defaults to `"End of training"`):
                 Message to commit while pushing.
-            blocking (:obj:`bool`, `optional`, defaults to :obj:`True`):
-                Whether the function should return only when the :obj:`git push` has finished.
+            blocking (`bool`, *optional*, defaults to `True`):
+                Whether the function should return only when the `git push` has finished.
             kwargs:
-                Additional keyword arguments passed along to :meth:`~transformers.Trainer.create_model_card`.
+                Additional keyword arguments passed along to [`~Trainer.create_model_card`].
 
         Returns:
-            The url of the commit of your model in the given repository if :obj:`blocking=False`, a tuple with the url
-            of the commit and an object to track the progress of the commit if :obj:`blocking=True`
+            The url of the commit of your model in the given repository if `blocking=False`, a tuple with the url
+            of the commit and an object to track the progress of the commit if `blocking=True`
         """
 
         if self.args.should_save:
@@ -2750,7 +2755,7 @@ class Trainer:
         metric_key_prefix: str = "eval",
     ) -> PredictionOutput:
         """
-        Prediction/evaluation loop, shared by :obj:`Trainer.evaluate()` and :obj:`Trainer.predict()`.
+        Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.
 
         Works both with or without labels.
         """
diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py
index b6904e6296..1a4a0eae75 100644
--- a/src/transformers/trainer_callback.py
+++ b/src/transformers/trainer_callback.py
@@ -35,40 +35,42 @@ logger = logging.get_logger(__name__)
 @dataclass
 class TrainerState:
     """
-    A class containing the :class:`~transformers.Trainer` inner state that will be saved along the model and optimizer
-    when checkpointing and passed to the :class:`~transformers.TrainerCallback`.
+    A class containing the [`Trainer`] inner state that will be saved along the model and optimizer
+    when checkpointing and passed to the [`TrainerCallback`].
 
-    .. note::
+    <Tip>
 
-        In all this class, one step is to be understood as one update step. When using gradient accumulation, one
-        update step may require several forward and backward passes: if you use :obj:`gradient_accumulation_steps=n`,
-        then one update step requires going through `n` batches.
+    In all this class, one step is to be understood as one update step. When using gradient accumulation, one
+    update step may require several forward and backward passes: if you use `gradient_accumulation_steps=n`,
+    then one update step requires going through *n* batches.
+
+    </Tip>
 
     Args:
-        epoch (:obj:`float`, `optional`):
+        epoch (`float`, *optional*):
             Only set during training, will represent the epoch the training is at (the decimal part being the
             percentage of the current epoch completed).
-        global_step (:obj:`int`, `optional`, defaults to 0):
+        global_step (`int`, *optional*, defaults to 0):
             During training, represents the number of update steps completed.
-        max_steps (:obj:`int`, `optional`, defaults to 0):
+        max_steps (`int`, *optional*, defaults to 0):
             The number of update steps to do during the current training.
-        total_flos (:obj:`float`, `optional`, defaults to 0):
+        total_flos (`float`, *optional*, defaults to 0):
             The total number of floating operations done by the model since the beginning of training (stored as floats
             to avoid overflow).
-        log_history (:obj:`List[Dict[str, float]]`, `optional`):
+        log_history (`List[Dict[str, float]]`, *optional*):
             The list of logs done since the beginning of training.
-        best_metric (:obj:`float`, `optional`):
+        best_metric (`float`, *optional*):
             When tracking the best model, the value of the best metric encountered so far.
-        best_model_checkpoint (:obj:`str`, `optional`):
+        best_model_checkpoint (`str`, *optional*):
             When tracking the best model, the value of the name of the checkpoint for the best model encountered so
             far.
-        is_local_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        is_local_process_zero (`bool`, *optional*, defaults to `True`):
             Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on
             several machines) main process.
-        is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        is_world_process_zero (`bool`, *optional*, defaults to `True`):
             Whether or not this process is the global main process (when training in a distributed fashion on several
-            machines, this is only going to be :obj:`True` for one process).
-        is_hyper_param_search (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            machines, this is only going to be `True` for one process).
+        is_hyper_param_search (`bool`, *optional*, defaults to `False`):
             Whether we are in the process of a hyper parameter search using Trainer.hyperparameter_search. This will
             impact the way data will be logged in TensorBoard.
     """
@@ -92,14 +94,14 @@ class TrainerState:
             self.log_history = []
 
     def save_to_json(self, json_path: str):
-        """Save the content of this instance in JSON format inside :obj:`json_path`."""
+        """Save the content of this instance in JSON format inside `json_path`."""
         json_string = json.dumps(dataclasses.asdict(self), indent=2, sort_keys=True) + "\n"
         with open(json_path, "w", encoding="utf-8") as f:
             f.write(json_string)
 
     @classmethod
     def load_from_json(cls, json_path: str):
-        """Create an instance from the content of :obj:`json_path`."""
+        """Create an instance from the content of `json_path`."""
         with open(json_path, "r", encoding="utf-8") as f:
             text = f.read()
         return cls(**json.loads(text))
@@ -108,30 +110,30 @@ class TrainerState:
 @dataclass
 class TrainerControl:
     """
-    A class that handles the :class:`~transformers.Trainer` control flow. This class is used by the
-    :class:`~transformers.TrainerCallback` to activate some switches in the training loop.
+    A class that handles the [`Trainer`] control flow. This class is used by the
+    [`TrainerCallback`] to activate some switches in the training loop.
 
     Args:
-        should_training_stop (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        should_training_stop (`bool`, *optional*, defaults to `False`):
             Whether or not the training should be interrupted.
 
-            If :obj:`True`, this variable will not be set back to :obj:`False`. The training will just stop.
-        should_epoch_stop (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If `True`, this variable will not be set back to `False`. The training will just stop.
+        should_epoch_stop (`bool`, *optional*, defaults to `False`):
             Whether or not the current epoch should be interrupted.
 
-            If :obj:`True`, this variable will be set back to :obj:`False` at the beginning of the next epoch.
-        should_save (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If `True`, this variable will be set back to `False` at the beginning of the next epoch.
+        should_save (`bool`, *optional*, defaults to `False`):
             Whether or not the model should be saved at this step.
 
-            If :obj:`True`, this variable will be set back to :obj:`False` at the beginning of the next step.
-        should_evaluate (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If `True`, this variable will be set back to `False` at the beginning of the next step.
+        should_evaluate (`bool`, *optional*, defaults to `False`):
             Whether or not the model should be evaluated at this step.
 
-            If :obj:`True`, this variable will be set back to :obj:`False` at the beginning of the next step.
-        should_log (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If `True`, this variable will be set back to `False` at the beginning of the next step.
+        should_log (`bool`, *optional*, defaults to `False`):
             Whether or not the logs should be reported at this step.
 
-            If :obj:`True`, this variable will be set back to :obj:`False` at the beginning of the next step.
+            If `True`, this variable will be set back to `False` at the beginning of the next step.
     """
 
     should_training_stop: bool = False
@@ -161,53 +163,54 @@ class TrainerCallback:
     each of those events the following arguments are available:
 
     Args:
-        args (:class:`~transformers.TrainingArguments`):
-            The training arguments used to instantiate the :class:`~transformers.Trainer`.
-        state (:class:`~transformers.TrainerState`):
-            The current state of the :class:`~transformers.Trainer`.
-        control (:class:`~transformers.TrainerControl`):
-            The object that is returned to the :class:`~transformers.Trainer` and can be used to make some decisions.
-        model (:class:`~transformers.PreTrainedModel` or :obj:`torch.nn.Module`):
+        args ([`TrainingArguments`]):
+            The training arguments used to instantiate the [`Trainer`].
+        state ([`TrainerState`]):
+            The current state of the [`Trainer`].
+        control ([`TrainerControl`]):
+            The object that is returned to the [`Trainer`] and can be used to make some decisions.
+        model ([`PreTrainedModel`] or `torch.nn.Module`):
             The model being trained.
-        tokenizer (:class:`~transformers.PreTrainedTokenizer`):
+        tokenizer ([`PreTrainedTokenizer`]):
             The tokenizer used for encoding the data.
-        optimizer (:obj:`torch.optim.Optimizer`):
+        optimizer (`torch.optim.Optimizer`):
             The optimizer used for the training steps.
-        lr_scheduler (:obj:`torch.optim.lr_scheduler.LambdaLR`):
+        lr_scheduler (`torch.optim.lr_scheduler.LambdaLR`):
             The scheduler used for setting the learning rate.
-        train_dataloader (:obj:`torch.utils.data.DataLoader`, `optional`):
+        train_dataloader (`torch.utils.data.DataLoader`, *optional*):
             The current dataloader used for training.
-        eval_dataloader (:obj:`torch.utils.data.DataLoader`, `optional`):
+        eval_dataloader (`torch.utils.data.DataLoader`, *optional*):
             The current dataloader used for training.
-        metrics (:obj:`Dict[str, float]`):
+        metrics (`Dict[str, float]`):
             The metrics computed by the last evaluation phase.
 
-            Those are only accessible in the event :obj:`on_evaluate`.
-        logs  (:obj:`Dict[str, float]`):
+            Those are only accessible in the event `on_evaluate`.
+        logs  (`Dict[str, float]`):
             The values to log.
 
-            Those are only accessible in the event :obj:`on_log`.
+            Those are only accessible in the event `on_log`.
 
-    The :obj:`control` object is the only one that can be changed by the callback, in which case the event that changes
+    The `control` object is the only one that can be changed by the callback, in which case the event that changes
     it should return the modified version.
 
-    The argument :obj:`args`, :obj:`state` and :obj:`control` are positionals for all events, all the others are
-    grouped in :obj:`kwargs`. You can unpack the ones you need in the signature of the event using them. As an example,
-    see the code of the simple :class:`~transformer.PrinterCallback`.
+    The arguments `args`, `state` and `control` are positional for all events, all the others are
+    grouped in `kwargs`. You can unpack the ones you need in the signature of the event using them. As an example,
+    see the code of the simple [`~transformers.PrinterCallback`].
 
-    Example::
+    Example:
 
-        class PrinterCallback(TrainerCallback):
+    ```python
+    class PrinterCallback(TrainerCallback):
 
-            def on_log(self, args, state, control, logs=None, **kwargs):
-                _ = logs.pop("total_flos", None)
-                if state.is_local_process_zero:
-                    print(logs)
-    """
+        def on_log(self, args, state, control, logs=None, **kwargs):
+            _ = logs.pop("total_flos", None)
+            if state.is_local_process_zero:
+                print(logs)
+    ```"""
 
     def on_init_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
         """
-        Event called at the end of the initialization of the :class:`~transformers.Trainer`.
+        Event called at the end of the initialization of the [`Trainer`].
         """
         pass
 
@@ -404,7 +407,7 @@ class CallbackHandler(TrainerCallback):
 
 class DefaultFlowCallback(TrainerCallback):
     """
-    A :class:`~transformers.TrainerCallback` that handles the default flow of the training loop for logs, evaluation
+    A [`TrainerCallback`] that handles the default flow of the training loop for logs, evaluation
     and checkpoints.
     """
 
@@ -451,7 +454,7 @@ class DefaultFlowCallback(TrainerCallback):
 
 class ProgressCallback(TrainerCallback):
     """
-    A :class:`~transformers.TrainerCallback` that displays the progress of training or evaluation.
+    A [`TrainerCallback`] that displays the progress of training or evaluation.
     """
 
     def __init__(self):
@@ -493,7 +496,7 @@ class ProgressCallback(TrainerCallback):
 
 class PrinterCallback(TrainerCallback):
     """
-    A bare :class:`~transformers.TrainerCallback` that just prints the logs.
+    A bare [`TrainerCallback`] that just prints the logs.
     """
 
     def on_log(self, args, state, control, logs=None, **kwargs):
@@ -504,18 +507,18 @@ class PrinterCallback(TrainerCallback):
 
 class EarlyStoppingCallback(TrainerCallback):
     """
-    A :class:`~transformers.TrainerCallback` that handles early stopping.
+    A [`TrainerCallback`] that handles early stopping.
 
     Args:
-       early_stopping_patience (:obj:`int`):
-            Use with :obj:`metric_for_best_model` to stop training when the specified metric worsens for
-            :obj:`early_stopping_patience` evaluation calls.
-       early_stopping_threshold(:obj:`float`, `optional`):
-            Use with TrainingArguments :obj:`metric_for_best_model` and :obj:`early_stopping_patience` to denote how
+       early_stopping_patience (`int`):
+            Use with `metric_for_best_model` to stop training when the specified metric worsens for
+            `early_stopping_patience` evaluation calls.
+       early_stopping_threshold(`float`, *optional*):
+            Use with TrainingArguments `metric_for_best_model` and `early_stopping_patience` to denote how
             much the specified metric must improve to satisfy early stopping conditions. `
 
-    This callback depends on :class:`~transformers.TrainingArguments` argument `load_best_model_at_end` functionality
-    to set best_metric in :class:`~transformers.TrainerState`.
+    This callback depends on [`TrainingArguments`] argument `load_best_model_at_end` functionality
+    to set best_metric in [`TrainerState`].
     """
 
     def __init__(self, early_stopping_patience: int = 1, early_stopping_threshold: Optional[float] = 0.0):
diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py
index c9c61ac7ba..0743fc610f 100644
--- a/src/transformers/trainer_pt_utils.py
+++ b/src/transformers/trainer_pt_utils.py
@@ -215,7 +215,7 @@ def torch_distributed_zero_first(local_rank: int):
     Decorator to make all processes in distributed training wait for each local_master to do something.
 
     Args:
-        local_rank (:obj:`int`): The rank of the local process.
+        local_rank (`int`): The rank of the local process.
     """
     if local_rank not in [-1, 0]:
         dist.barrier()
@@ -230,12 +230,12 @@ class DistributedSamplerWithLoop(DistributedSampler):
     shuffled samples to make each process have a round multiple of batch_size samples.
 
     Args:
-        dataset (:obj:`torch.utils.data.Dataset`):
+        dataset (`torch.utils.data.Dataset`):
             Dataset used for sampling.
-        batch_size (:obj:`int`):
+        batch_size (`int`):
             The batch size used with this sampler
         kwargs:
-            All other keyword arguments passed to :obj:`DistributedSampler`.
+            All other keyword arguments passed to `DistributedSampler`.
     """
 
     def __init__(self, dataset, batch_size, **kwargs):
@@ -342,43 +342,43 @@ class DistributedTensorGatherer:
     If our dataset has 16 samples with a batch size of 2 on 3 processes and we gather then transfer on CPU at every
     step, our sampler will generate the following indices:
 
-        :obj:`[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1]`
+        `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1]`
 
     to get something of size a multiple of 3 (so that each process gets the same dataset length). Then process 0, 1 and
     2 will be responsible of making predictions for the following samples:
 
-        - P0: :obj:`[0, 1, 2, 3, 4, 5]`
-        - P1: :obj:`[6, 7, 8, 9, 10, 11]`
-        - P2: :obj:`[12, 13, 14, 15, 0, 1]`
+        - P0: `[0, 1, 2, 3, 4, 5]`
+        - P1: `[6, 7, 8, 9, 10, 11]`
+        - P2: `[12, 13, 14, 15, 0, 1]`
 
     The first batch treated on each process will be
 
-        - P0: :obj:`[0, 1]`
-        - P1: :obj:`[6, 7]`
-        - P2: :obj:`[12, 13]`
+        - P0: `[0, 1]`
+        - P1: `[6, 7]`
+        - P2: `[12, 13]`
 
     So if we gather at the end of the first batch, we will get a tensor (nested list/tuple of tensor) corresponding to
     the following indices:
 
-        :obj:`[0, 1, 6, 7, 12, 13]`
+        `[0, 1, 6, 7, 12, 13]`
 
     If we directly concatenate our results without taking any precautions, the user will then get the predictions for
     the indices in this order at the end of the prediction loop:
 
-        :obj:`[0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1]`
+        `[0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1]`
 
     For some reason, that's not going to roll their boat. This class is there to solve that problem.
 
     Args:
 
-        world_size (:obj:`int`):
+        world_size (`int`):
             The number of processes used in the distributed training.
-        num_samples (:obj:`int`):
+        num_samples (`int`):
             The number of samples in our dataset.
-        make_multiple_of (:obj:`int`, `optional`):
+        make_multiple_of (`int`, *optional*):
             If passed, the class assumes the datasets passed to each process are made to be a multiple of this argument
             (by adding samples).
-        padding_index (:obj:`int`, `optional`, defaults to -100):
+        padding_index (`int`, *optional*, defaults to -100):
             The padding index to use if the arrays don't all have the same sequence length.
     """
 
@@ -398,7 +398,7 @@ class DistributedTensorGatherer:
 
     def add_arrays(self, arrays):
         """
-        Add :obj:`arrays` to the internal storage, Will initialize the storage to the full size at the first arrays
+        Add `arrays` to the internal storage, Will initialize the storage to the full size at the first arrays
         passed so that if we're bound to get an OOM, it happens at the beginning.
         """
         if arrays is None:
@@ -450,9 +450,9 @@ class LabelSmoother:
     Adds label-smoothing on a pre-computed output from a Transformers model.
 
     Args:
-        epsilon (:obj:`float`, `optional`, defaults to 0.1):
+        epsilon (`float`, *optional*, defaults to 0.1):
             The label smoothing factor.
-        ignore_index (:obj:`int`, `optional`, defaults to -100):
+        ignore_index (`int`, *optional*, defaults to -100):
             The index in the labels to ignore when computing the loss.
     """
 
@@ -485,14 +485,14 @@ class LabelSmoother:
 
 def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, generator=None):
     """
-    Return a list of indices so that each slice of :obj:`batch_size` consecutive indices correspond to elements of
+    Return a list of indices so that each slice of `batch_size` consecutive indices correspond to elements of
     similar lengths. To do this, the indices are:
 
     - randomly permuted
-    - grouped in mega-batches of size :obj:`mega_batch_mult * batch_size`
+    - grouped in mega-batches of size `mega_batch_mult * batch_size`
     - sorted by length in each mega-batch
 
-    The result is the concatenation of all mega-batches, with the batch of :obj:`batch_size` containing the element of
+    The result is the concatenation of all mega-batches, with the batch of `batch_size` containing the element of
     maximum length placed first, so that an OOM happens sooner rather than later.
     """
     # Default for mega_batch_mult: 50 or the number to get 4 megabatches, whichever is smaller.
@@ -641,11 +641,10 @@ class DistributedLengthGroupedSampler(DistributedSampler):
 class ShardSampler(Sampler):
     """
     Sampler that shards batches between several processes. Dispatches indices batch by batch: on 2 processes with batch
-    size 4, the first two batches are :obj:`[0, 1, 2, 3, 4, 5, 6, 7]` and :obj:`[8, 9, 10, 11, 12, 13, 14, 15]`, which
-    shard into :obj:`[0, 1, 2, 3]` and :obj:`[8, 9, 10, 11]` for GPU-0 and :obj:`[4, 5, 6, 7]` and :obj:`[12, 13, 14,
-    15]` for GPU-1.
+    size 4, the first two batches are `[0, 1, 2, 3, 4, 5, 6, 7]` and `[8, 9, 10, 11, 12, 13, 14, 15]`, which
+    shard into `[0, 1, 2, 3]` and `[8, 9, 10, 11]` for GPU-0 and `[4, 5, 6, 7]` and `[12, 13, 14, 15]` for GPU-1.
 
-    The sampler thus yields :obj:`[0, 1, 2, 3, 8, 9, 10, 11]` on GPU-0 and :obj:`[4, 5, 6, 7, 12, 13, 14, 15]` on
+    The sampler thus yields `[0, 1, 2, 3, 8, 9, 10, 11]` on GPU-0 and `[4, 5, 6, 7, 12, 13, 14, 15]` on
     GPU-1.
     """
 
@@ -689,44 +688,44 @@ class ShardSampler(Sampler):
 
 class IterableDatasetShard(IterableDataset):
     """
-    Wraps a PyTorch :obj:`IterableDataset` to generate samples for one of the processes only. Instances of this class
-    will always yield a number of samples that is a round multiple of the actual batch size (which is :obj:`batch_size
-    x num_processes`). Depending on the value of the :obj:`drop_last` attribute, it will either stop the iteration at
+    Wraps a PyTorch `IterableDataset` to generate samples for one of the processes only. Instances of this class
+    will always yield a number of samples that is a round multiple of the actual batch size (which is `batch_size x num_processes`). Depending on the value of the `drop_last` attribute, it will either stop the iteration at
     the first batch that would be too small or loop with indices from the beginning.
 
-    On two processes with an iterable dataset yielding of :obj:`[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]` with a batch
+    On two processes with an iterable dataset yielding `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]` with a batch
     size of 2:
 
-    - the shard on process 0 will yield :obj:`[0, 1, 4, 5, 8, 9]` so will see batches :obj:`[0, 1]`, :obj:`[4, 5]`,
-      :obj:`[8, 9]`
-    - the shard on process 1 will yield :obj:`[2, 3, 6, 7, 10, 11]` so will see batches :obj:`[2, 3]`, :obj:`[6, 7]`,
-      :obj:`[10, 11]`
+    - the shard on process 0 will yield `[0, 1, 4, 5, 8, 9]` so will see batches `[0, 1]`, `[4, 5]`,
+      `[8, 9]`
+    - the shard on process 1 will yield `[2, 3, 6, 7, 10, 11]` so will see batches `[2, 3]`, `[6, 7]`,
+      `[10, 11]`
 
-    .. warning:
+    <Tip warning={true}>
 
         If your IterableDataset implements some randomization that needs to be applied the same way on all processes
-        (for instance, a shuffling), you should use a :obj:`torch.Generator` in a :obj:`generator` attribute of the
-        :obj:`dataset` to generate your random numbers and call the
-        :meth:`~transformers.trainer_pt_utils.IterableDatasetShard.set_epoch` method of this object. It will set the
-        seed of this :obj:`generator` to :obj:`seed + epoch` on all processes before starting the iteration.
-        Alternatively, you can also implement a :obj:`set_epoch()` method in your iterable dataset to deal with this.
+        (for instance, a shuffling), you should use a `torch.Generator` in a `generator` attribute of the
+        `dataset` to generate your random numbers and call the
+        [`~trainer_pt_utils.IterableDatasetShard.set_epoch`] method of this object. It will set the
+        seed of this `generator` to `seed + epoch` on all processes before starting the iteration.
+        Alternatively, you can also implement a `set_epoch()` method in your iterable dataset to deal with this.
 
+    </Tip>
 
     Args:
-        dataset (:obj:`torch.utils.data.IterableDataset`):
+        dataset (`torch.utils.data.IterableDataset`):
             The batch sampler to split in several shards.
-        batch_size (:obj:`int`, `optional`, defaults to 1):
+        batch_size (`int`, *optional*, defaults to 1):
             The size of the batches per shard.
-        drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        drop_last (`bool`, *optional*, defaults to `False`):
             Whether or not to drop the last incomplete batch or complete the last batches by using the samples from the
             beginning.
-        num_processes (:obj:`int`, `optional`, defaults to 1):
+        num_processes (`int`, *optional*, defaults to 1):
             The number of processes running concurrently.
-        process_index (:obj:`int`, `optional`, defaults to 0):
+        process_index (`int`, *optional*, defaults to 0):
             The index of the current process.
-        seed (:obj:`int`, `optional`, defaults to 0):
+        seed (`int`, *optional*, defaults to 0):
             A random seed that will be used for the random number generation in
-            :meth:`~transformers.trainer_pt_utils.IterableDatasetShard.set_epoch`.
+            [`~trainer_pt_utils.IterableDatasetShard.set_epoch`].
     """
 
     def __init__(
@@ -834,11 +833,11 @@ def metrics_format(self, metrics: Dict[str, float]) -> Dict[str, float]:
     Reformat Trainer metrics values to a human-readable format
 
     Args:
-        metrics (:obj:`Dict[str, float]`):
+        metrics (`Dict[str, float]`):
             The metrics returned from train/evaluate/predict
 
     Returns:
-        metrics (:obj:`Dict[str, float]`): The reformatted metrics
+        metrics (`Dict[str, float]`): The reformatted metrics
     """
 
     metrics_copy = metrics.copy()
@@ -862,38 +861,40 @@ def log_metrics(self, split, metrics):
     Under distributed environment this is done only for a process with rank 0.
 
     Args:
-        split (:obj:`str`):
-            Mode/split name: one of ``train``, ``eval``, ``test``
-        metrics (:obj:`Dict[str, float]`):
+        split (`str`):
+            Mode/split name: one of `train`, `eval`, `test`
+        metrics (`Dict[str, float]`):
             The metrics returned from train/evaluate/predictmetrics: metrics dict
 
     Notes on memory reports:
 
-    In order to get memory usage report you need to install ``psutil``. You can do that with ``pip install psutil``.
+    In order to get memory usage report you need to install `psutil`. You can do that with `pip install psutil`.
 
-    Now when this method is run, you will see a report that will include: ::
+    Now when this method is run, you will see a report that will include:
 
-        init_mem_cpu_alloc_delta   =     1301MB
-        init_mem_cpu_peaked_delta  =      154MB
-        init_mem_gpu_alloc_delta   =      230MB
-        init_mem_gpu_peaked_delta  =        0MB
-        train_mem_cpu_alloc_delta  =     1345MB
-        train_mem_cpu_peaked_delta =        0MB
-        train_mem_gpu_alloc_delta  =      693MB
-        train_mem_gpu_peaked_delta =        7MB
+    ```
+    init_mem_cpu_alloc_delta   =     1301MB
+    init_mem_cpu_peaked_delta  =      154MB
+    init_mem_gpu_alloc_delta   =      230MB
+    init_mem_gpu_peaked_delta  =        0MB
+    train_mem_cpu_alloc_delta  =     1345MB
+    train_mem_cpu_peaked_delta =        0MB
+    train_mem_gpu_alloc_delta  =      693MB
+    train_mem_gpu_peaked_delta =        7MB
+    ```
 
     **Understanding the reports:**
 
-    - the first segment, e.g., ``train__``, tells you which stage the metrics are for. Reports starting with ``init_``
-      will be added to the first stage that gets run. So that if only evaluation is run, the memory usage for the
-      ``__init__`` will be reported along with the ``eval_`` metrics.
-    - the third segment, is either ``cpu`` or ``gpu``, tells you whether it's the general RAM or the gpu0 memory
-      metric.
-    - ``*_alloc_delta`` - is the difference in the used/allocated memory counter between the end and the start of the
-      stage - it can be negative if a function released more memory than it allocated.
-    - ``*_peaked_delta`` - is any extra memory that was consumed and then freed - relative to the current allocated
-      memory counter - it is never negative. When you look at the metrics of any stage you add up ``alloc_delta`` +
-      ``peaked_delta`` and you know how much memory was needed to complete that stage.
+    - the first segment, e.g., `train_`, tells you which stage the metrics are for. Reports starting with `init_`
+        will be added to the first stage that gets run. So that if only evaluation is run, the memory usage for the
+        `__init__` will be reported along with the `eval_` metrics.
+    - the third segment, is either `cpu` or `gpu`, tells you whether it's the general RAM or the gpu0 memory
+        metric.
+    - `*_alloc_delta` - is the difference in the used/allocated memory counter between the end and the start of the
+        stage - it can be negative if a function released more memory than it allocated.
+    - `*_peaked_delta` - is any extra memory that was consumed and then freed - relative to the current allocated
+        memory counter - it is never negative. When you look at the metrics of any stage you add up `alloc_delta` +
+        `peaked_delta` and you know how much memory was needed to complete that stage.
 
     The reporting happens only for process of rank 0 and gpu 0 (if there is a gpu). Typically this is enough since the
     main process does the bulk of work, but it could be not quite so if model parallel is used and then other GPUs may
@@ -907,29 +908,29 @@ def log_metrics(self, split, metrics):
 
     The CPU peak memory is measured using a sampling thread. Due to python's GIL it may miss some of the peak memory if
     that thread didn't get a chance to run when the highest memory was used. Therefore this report can be less than
-    reality. Using ``tracemalloc`` would have reported the exact peak memory, but it doesn't report memory allocations
+    reality. Using `tracemalloc` would have reported the exact peak memory, but it doesn't report memory allocations
     outside of python. So if some C++ CUDA extension allocated its own memory it won't be reported. And therefore it
     was dropped in favor of the memory sampling approach, which reads the current process memory usage.
 
-    The GPU allocated and peak memory reporting is done with ``torch.cuda.memory_allocated()`` and
-    ``torch.cuda.max_memory_allocated()``. This metric reports only "deltas" for pytorch-specific allocations, as
-    ``torch.cuda`` memory management system doesn't track any memory allocated outside of pytorch. For example, the
+    The GPU allocated and peak memory reporting is done with `torch.cuda.memory_allocated()` and
+    `torch.cuda.max_memory_allocated()`. This metric reports only "deltas" for pytorch-specific allocations, as
+    `torch.cuda` memory management system doesn't track any memory allocated outside of pytorch. For example, the
     very first cuda call typically loads CUDA kernels, which may take from 0.5 to 2GB of GPU memory.
 
-    Note that this tracker doesn't account for memory allocations outside of :class:`~transformers.Trainer`'s
-    ``__init__``, ``train``, ``evaluate`` and ``predict`` calls.
+    Note that this tracker doesn't account for memory allocations outside of [`Trainer`]'s
+    `__init__`, `train`, `evaluate` and `predict` calls.
 
-    Because ``evaluation`` calls may happen during ``train``, we can't handle nested invocations because
-    ``torch.cuda.max_memory_allocated`` is a single counter, so if it gets reset by a nested eval call, ``train``'s
-    tracker will report incorrect info. If this `pytorch issue <https://github.com/pytorch/pytorch/issues/16266>`__
+    Because `evaluation` calls may happen during `train`, we can't handle nested invocations because
+    `torch.cuda.max_memory_allocated` is a single counter, so if it gets reset by a nested eval call, `train`'s
+    tracker will report incorrect info. If this [pytorch issue](https://github.com/pytorch/pytorch/issues/16266)
     gets resolved it will be possible to change this class to be re-entrant. Until then we will only track the outer
-    level of ``train``, ``evaluate`` and ``predict`` methods. Which means that if ``eval`` is called during ``train``,
+    level of `train`, `evaluate` and `predict` methods. Which means that if `eval` is called during `train`,
     it's the latter that will account for its memory usage and that of the former.
 
-    This also means that if any other tool that is used along the :class:`~transformers.Trainer` calls
-    ``torch.cuda.reset_peak_memory_stats``, the gpu peak memory stats could be invalid. And the
-    :class:`~transformers.Trainer` will disrupt the normal behavior of any such tools that rely on calling
-    ``torch.cuda.reset_peak_memory_stats`` themselves.
+    This also means that if any other tool that is used along the [`Trainer`] calls
+    `torch.cuda.reset_peak_memory_stats`, the gpu peak memory stats could be invalid. And the
+    [`Trainer`] will disrupt the normal behavior of any such tools that rely on calling
+    `torch.cuda.reset_peak_memory_stats` themselves.
 
     For best performance you may want to consider turning the memory profiling off for production runs.
     """
@@ -946,19 +947,19 @@ def log_metrics(self, split, metrics):
 
 def save_metrics(self, split, metrics, combined=True):
     """
-    Save metrics into a json file for that split, e.g. ``train_results.json``.
+    Save metrics into a json file for that split, e.g. `train_results.json`.
 
     Under distributed environment this is done only for a process with rank 0.
 
     Args:
-        split (:obj:`str`):
-            Mode/split name: one of ``train``, ``eval``, ``test``, ``all``
-        metrics (:obj:`Dict[str, float]`):
+        split (`str`):
+            Mode/split name: one of `train`, `eval`, `test`, `all`
+        metrics (`Dict[str, float]`):
             The metrics returned from train/evaluate/predict
-        combined (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Creates combined metrics by updating ``all_results.json`` with metrics of this call
+        combined (`bool`, *optional*, defaults to `True`):
+            Creates combined metrics by updating `all_results.json` with metrics of this call
 
-    To understand the metrics please read the docstring of :meth:`~transformers.Trainer.log_metrics`. The only
+    To understand the metrics please read the docstring of [`~Trainer.log_metrics`]. The only
     difference is that raw unformatted numbers are saved in the current method.
 
     """
diff --git a/src/transformers/trainer_seq2seq.py b/src/transformers/trainer_seq2seq.py
index 0666f840a2..0a814fc0f6 100644
--- a/src/transformers/trainer_seq2seq.py
+++ b/src/transformers/trainer_seq2seq.py
@@ -40,24 +40,24 @@ class Seq2SeqTrainer(Trainer):
         Run evaluation and returns metrics.
 
         The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
-        (pass it to the init :obj:`compute_metrics` argument).
+        (pass it to the init `compute_metrics` argument).
 
         You can also subclass and override this method to inject custom behavior.
 
         Args:
-            eval_dataset (:obj:`Dataset`, `optional`):
-                Pass a dataset if you wish to override :obj:`self.eval_dataset`. If it is an :obj:`datasets.Dataset`,
-                columns not accepted by the ``model.forward()`` method are automatically removed. It must implement the
-                :obj:`__len__` method.
-            ignore_keys (:obj:`List[str]`, `optional`):
+            eval_dataset (`Dataset`, *optional*):
+                Pass a dataset if you wish to override `self.eval_dataset`. If it is a `datasets.Dataset`,
+                columns not accepted by the `model.forward()` method are automatically removed. It must implement the
+                `__len__` method.
+            ignore_keys (`List[str]`, *optional*):
                 A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                 gathering predictions.
-            metric_key_prefix (:obj:`str`, `optional`, defaults to :obj:`"eval"`):
+            metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
                 An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
-                "eval_bleu" if the prefix is ``"eval"`` (default)
-            max_length (:obj:`int`, `optional`):
+                "eval_bleu" if the prefix is `"eval"` (default)
+            max_length (`int`, *optional*):
                 The maximum target length to use when predicting with the generate method.
-            num_beams (:obj:`int`, `optional`):
+            num_beams (`int`, *optional*):
                 Number of beams for beam search that will be used when predicting with the generate method. 1 means no
                 beam search.
 
@@ -81,35 +81,37 @@ class Seq2SeqTrainer(Trainer):
         Run prediction and returns predictions and potential metrics.
 
         Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
-        will also return metrics, like in :obj:`evaluate()`.
+        will also return metrics, like in `evaluate()`.
 
         Args:
-            test_dataset (:obj:`Dataset`):
-                Dataset to run the predictions on. If it is an :obj:`datasets.Dataset`, columns not accepted by the
-                ``model.forward()`` method are automatically removed. Has to implement the method :obj:`__len__`
-            ignore_keys (:obj:`List[str]`, `optional`):
+            test_dataset (`Dataset`):
+                Dataset to run the predictions on. If it is a `datasets.Dataset`, columns not accepted by the
+                `model.forward()` method are automatically removed. Has to implement the method `__len__`
+            ignore_keys (`List[str]`, *optional*):
                 A list of keys in the output of your model (if it is a dictionary) that should be ignored when
                 gathering predictions.
-            metric_key_prefix (:obj:`str`, `optional`, defaults to :obj:`"eval"`):
+            metric_key_prefix (`str`, *optional*, defaults to `"eval"`):
                 An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named
-                "eval_bleu" if the prefix is ``"eval"`` (default)
-            max_length (:obj:`int`, `optional`):
+                "eval_bleu" if the prefix is `"eval"` (default)
+            max_length (`int`, *optional*):
                 The maximum target length to use when predicting with the generate method.
-            num_beams (:obj:`int`, `optional`):
+            num_beams (`int`, *optional*):
                 Number of beams for beam search that will be used when predicting with the generate method. 1 means no
                 beam search.
 
-        .. note::
+        <Tip>
 
-            If your predictions or labels have different sequence lengths (for instance because you're doing dynamic
-            padding in a token classification task) the predictions will be padded (on the right) to allow for
-            concatenation into one array. The padding index is -100.
+        If your predictions or labels have different sequence lengths (for instance because you're doing dynamic
+        padding in a token classification task) the predictions will be padded (on the right) to allow for
+        concatenation into one array. The padding index is -100.
 
-        Returns: `NamedTuple` A namedtuple with the following keys:
+        </Tip>
 
-            - predictions (:obj:`np.ndarray`): The predictions on :obj:`test_dataset`.
-            - label_ids (:obj:`np.ndarray`, `optional`): The labels (if the dataset contained some).
-            - metrics (:obj:`Dict[str, float]`, `optional`): The potential dictionary of metrics (if the dataset
+        Returns: `NamedTuple` A namedtuple with the following keys:
+
+            - predictions (`np.ndarray`): The predictions on `test_dataset`.
+            - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
+            - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset
               contained labels).
         """
         self._max_length = max_length if max_length is not None else self.args.generation_max_length
@@ -124,19 +126,19 @@ class Seq2SeqTrainer(Trainer):
         ignore_keys: Optional[List[str]] = None,
     ) -> Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
         """
-        Perform an evaluation step on :obj:`model` using obj:`inputs`.
+        Perform an evaluation step on `model` using `inputs`.
 
         Subclass and override to inject custom behavior.
 
         Args:
-            model (:obj:`nn.Module`):
+            model (`nn.Module`):
                 The model to evaluate.
-            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
+            inputs (`Dict[str, Union[torch.Tensor, Any]]`):
                 The inputs and targets of the model.
 
                 The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
-                argument :obj:`labels`. Check your model's documentation for all accepted arguments.
-            prediction_loss_only (:obj:`bool`):
+                argument `labels`. Check your model's documentation for all accepted arguments.
+            prediction_loss_only (`bool`):
                 Whether or not to return the loss only.
 
         Return:
diff --git a/src/transformers/trainer_tf.py b/src/transformers/trainer_tf.py
index b2d5373636..03591cbb76 100644
--- a/src/transformers/trainer_tf.py
+++ b/src/transformers/trainer_tf.py
@@ -53,33 +53,33 @@ class TFTrainer:
     TFTrainer is a simple but feature-complete training and eval loop for TensorFlow, optimized for 🤗 Transformers.
 
     Args:
-        model (:class:`~transformers.TFPreTrainedModel`):
+        model ([`TFPreTrainedModel`]):
             The model to train, evaluate or use for predictions.
-        args (:class:`~transformers.TFTrainingArguments`):
+        args ([`TFTrainingArguments`]):
             The arguments to tweak training.
-        train_dataset (:class:`~tf.data.Dataset`, `optional`):
-            The dataset to use for training. The dataset should yield tuples of ``(features, labels)`` where
-            ``features`` is a dict of input features and ``labels`` is the labels. If ``labels`` is a tensor, the loss
-            is calculated by the model by calling ``model(features, labels=labels)``. If ``labels`` is a dict, such as
+        train_dataset ([`~tf.data.Dataset`], *optional*):
+            The dataset to use for training. The dataset should yield tuples of `(features, labels)` where
+            `features` is a dict of input features and `labels` is the labels. If `labels` is a tensor, the loss
+            is calculated by the model by calling `model(features, labels=labels)`. If `labels` is a dict, such as
             when using a QuestionAnswering head model with multiple targets, the loss is instead calculated by calling
-            ``model(features, **labels)``.
-        eval_dataset (:class:`~tf.data.Dataset`, `optional`):
-            The dataset to use for evaluation. The dataset should yield tuples of ``(features, labels)`` where
-            ``features`` is a dict of input features and ``labels`` is the labels. If ``labels`` is a tensor, the loss
-            is calculated by the model by calling ``model(features, labels=labels)``. If ``labels`` is a dict, such as
+            `model(features, **labels)`.
+        eval_dataset ([`~tf.data.Dataset`], *optional*):
+            The dataset to use for evaluation. The dataset should yield tuples of `(features, labels)` where
+            `features` is a dict of input features and `labels` is the labels. If `labels` is a tensor, the loss
+            is calculated by the model by calling `model(features, labels=labels)`. If `labels` is a dict, such as
             when using a QuestionAnswering head model with multiple targets, the loss is instead calculated by calling
-            ``model(features, **labels)``.
-        compute_metrics (:obj:`Callable[[EvalPrediction], Dict]`, `optional`):
+            `model(features, **labels)`.
+        compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*):
             The function that will be used to compute metrics at evaluation. Must take a
-            :class:`~transformers.EvalPrediction` and return a dictionary string to metric values.
-        tb_writer (:obj:`tf.summary.SummaryWriter`, `optional`):
+            [`EvalPrediction`] and return a dictionary string to metric values.
+        tb_writer (`tf.summary.SummaryWriter`, *optional*):
             Object to write to TensorBoard.
-        optimizers (:obj:`Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule]`, `optional`):
+        optimizers (`Tuple[tf.keras.optimizers.Optimizer, tf.keras.optimizers.schedules.LearningRateSchedule]`, *optional*):
             A tuple containing the optimizer and the scheduler to use. The optimizer default to an instance of
-            :class:`tf.keras.optimizers.Adam` if :obj:`args.weight_decay_rate` is 0 else an instance of
-            :class:`~transformers.AdamWeightDecay`. The scheduler will default to an instance of
-            :class:`tf.keras.optimizers.schedules.PolynomialDecay` if :obj:`args.num_warmup_steps` is 0 else an
-            instance of :class:`~transformers.WarmUp`.
+            [`tf.keras.optimizers.Adam`] if `args.weight_decay_rate` is 0 else an instance of
+            [`AdamWeightDecay`]. The scheduler will default to an instance of
+            [`tf.keras.optimizers.schedules.PolynomialDecay`] if `args.num_warmup_steps` is 0 else an
+            instance of [`WarmUp`].
     """
 
     def __init__(
@@ -139,7 +139,7 @@ class TFTrainer:
 
     def get_train_tfdataset(self) -> tf.data.Dataset:
         """
-        Returns the training :class:`~tf.data.Dataset`.
+        Returns the training [`~tf.data.Dataset`].
 
         Subclass and override this method if you want to inject some custom behavior.
         """
@@ -163,15 +163,14 @@ class TFTrainer:
 
     def get_eval_tfdataset(self, eval_dataset: Optional[tf.data.Dataset] = None) -> tf.data.Dataset:
         """
-        Returns the evaluation :class:`~tf.data.Dataset`.
+        Returns the evaluation [`~tf.data.Dataset`].
 
         Args:
-            eval_dataset (:class:`~tf.data.Dataset`, `optional`):
-                If provided, will override `self.eval_dataset`. The dataset should yield tuples of ``(features,
-                labels)`` where ``features`` is a dict of input features and ``labels`` is the labels. If ``labels`` is
-                a tensor, the loss is calculated by the model by calling ``model(features, labels=labels)``. If
-                ``labels`` is a dict, such as when using a QuestionAnswering head model with multiple targets, the loss
-                is instead calculated by calling ``model(features, **labels)``.
+            eval_dataset ([`~tf.data.Dataset`], *optional*):
+                If provided, will override `self.eval_dataset`. The dataset should yield tuples of `(features, labels)` where `features` is a dict of input features and `labels` is the labels. If `labels` is
+                a tensor, the loss is calculated by the model by calling `model(features, labels=labels)`. If
+                `labels` is a dict, such as when using a QuestionAnswering head model with multiple targets, the loss
+                is instead calculated by calling `model(features, **labels)`.
 
         Subclass and override this method if you want to inject some custom behavior.
         """
@@ -196,15 +195,15 @@ class TFTrainer:
 
     def get_test_tfdataset(self, test_dataset: tf.data.Dataset) -> tf.data.Dataset:
         """
-        Returns a test :class:`~tf.data.Dataset`.
+        Returns a test [`~tf.data.Dataset`].
 
         Args:
-            test_dataset (:class:`~tf.data.Dataset`):
-                The dataset to use. The dataset should yield tuples of ``(features, labels)`` where ``features`` is a
-                dict of input features and ``labels`` is the labels. If ``labels`` is a tensor, the loss is calculated
-                by the model by calling ``model(features, labels=labels)``. If ``labels`` is a dict, such as when using
+            test_dataset ([`~tf.data.Dataset`]):
+                The dataset to use. The dataset should yield tuples of `(features, labels)` where `features` is a
+                dict of input features and `labels` is the labels. If `labels` is a tensor, the loss is calculated
+                by the model by calling `model(features, labels=labels)`. If `labels` is a dict, such as when using
                 a QuestionAnswering head model with multiple targets, the loss is instead calculated by calling
-                ``model(features, **labels)``.
+                `model(features, **labels)`.
 
         Subclass and override this method if you want to inject some custom behavior.
         """
@@ -224,7 +223,7 @@ class TFTrainer:
         Setup the optimizer and the learning rate scheduler.
 
         We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
-        TFTrainer's init through :obj:`optimizers`, or subclass and override this method.
+        TFTrainer's init through `optimizers`, or subclass and override this method.
         """
         if not self.optimizer and not self.lr_scheduler:
             warmup_steps = (
@@ -302,8 +301,8 @@ class TFTrainer:
         prediction_loss_only: Optional[bool] = None,
     ) -> PredictionOutput:
         """
-        Prediction/evaluation loop, shared by :func:`~transformers.TFTrainer.evaluate` and
-        :func:`~transformers.TFTrainer.predict`.
+        Prediction/evaluation loop, shared by [`~TFTrainer.evaluate`] and
+        [`~TFTrainer.predict`].
 
         Works both with or without labels.
         """
@@ -382,12 +381,12 @@ class TFTrainer:
 
     def log(self, logs: Dict[str, float]) -> None:
         """
-        Log :obj:`logs` on the various objects watching training.
+        Log `logs` on the various objects watching training.
 
         Subclass and override this method to inject custom behavior.
 
         Args:
-            logs (:obj:`Dict[str, float]`):
+            logs (`Dict[str, float]`):
                 The values to log.
         """
         logs["epoch"] = self.epoch_logging
@@ -417,15 +416,14 @@ class TFTrainer:
         Run evaluation and returns metrics.
 
         The calling script will be responsible for providing a method to compute metrics, as they are task-dependent
-        (pass it to the init :obj:`compute_metrics` argument).
+        (pass it to the init `compute_metrics` argument).
 
         Args:
-            eval_dataset (:class:`~tf.data.Dataset`, `optional`):
-                Pass a dataset if you wish to override :obj:`self.eval_dataset`. The dataset should yield tuples of
-                ``(features, labels)`` where ``features`` is a dict of input features and ``labels`` is the labels. If
-                ``labels`` is a tensor, the loss is calculated by the model by calling ``model(features,
-                labels=labels)``. If ``labels`` is a dict, such as when using a QuestionAnswering head model with
-                multiple targets, the loss is instead calculated by calling ``model(features, **labels)``.
+            eval_dataset ([`~tf.data.Dataset`], *optional*):
+                Pass a dataset if you wish to override `self.eval_dataset`. The dataset should yield tuples of
+                `(features, labels)` where `features` is a dict of input features and `labels` is the labels. If
+                `labels` is a tensor, the loss is calculated by the model by calling `model(features, labels=labels)`. If `labels` is a dict, such as when using a QuestionAnswering head model with
+                multiple targets, the loss is instead calculated by calling `model(features, **labels)`.
 
         Returns:
             A dictionary containing the evaluation loss and the potential metrics computed from the predictions.
@@ -736,12 +734,12 @@ class TFTrainer:
         Subclass and override this method if you want to inject some custom behavior.
 
         Args:
-            features (:obj:`tf.Tensor`): A batch of input features.
-            labels (:obj:`tf.Tensor`): A batch of labels.
-            training (:obj:`bool`): Whether or not to run the model in training mode.
+            features (`tf.Tensor`): A batch of input features.
+            labels (`tf.Tensor`): A batch of labels.
+            training (`bool`): Whether or not to run the model in training mode.
 
         Returns:
-            A tuple of two :obj:`tf.Tensor`: The loss and logits.
+            A tuple of two `tf.Tensor`: The loss and logits.
         """
 
         if self.args.past_index >= 0 and getattr(self, "_past", None) is not None:
@@ -764,21 +762,21 @@ class TFTrainer:
         Run prediction and returns predictions and potential metrics.
 
         Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method
-        will also return metrics, like in :obj:`evaluate()`.
+        will also return metrics, like in `evaluate()`.
 
         Args:
-            test_dataset (:class:`~tf.data.Dataset`):
-                Dataset to run the predictions on. The dataset should yield tuples of ``(features, labels)`` where
-                ``features`` is a dict of input features and ``labels`` is the labels. If ``labels`` is a tensor, the
-                loss is calculated by the model by calling ``model(features, labels=labels)``. If ``labels`` is a dict,
+            test_dataset ([`~tf.data.Dataset`]):
+                Dataset to run the predictions on. The dataset should yield tuples of `(features, labels)` where
+                `features` is a dict of input features and `labels` is the labels. If `labels` is a tensor, the
+                loss is calculated by the model by calling `model(features, labels=labels)`. If `labels` is a dict,
                 such as when using a QuestionAnswering head model with multiple targets, the loss is instead calculated
-                by calling ``model(features, **labels)``
+                by calling `model(features, **labels)`.
 
-        Returns: `NamedTuple` A namedtuple with the following keys:
+        Returns: `NamedTuple` A namedtuple with the following keys:
 
-            - predictions (:obj:`np.ndarray`): The predictions on :obj:`test_dataset`.
-            - label_ids (:obj:`np.ndarray`, `optional`): The labels (if the dataset contained some).
-            - metrics (:obj:`Dict[str, float]`, `optional`): The potential dictionary of metrics (if the dataset
+            - predictions (`np.ndarray`): The predictions on `test_dataset`.
+            - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some).
+            - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset
               contained labels).
         """
         test_ds, steps, num_examples = self.get_test_tfdataset(test_dataset)
@@ -787,7 +785,7 @@ class TFTrainer:
 
     def save_model(self, output_dir: Optional[str] = None):
         """
-        Will save the model, so you can reload it using :obj:`from_pretrained()`.
+        Will save the model, so you can reload it using `from_pretrained()`.
         """
         output_dir = output_dir if output_dir is not None else self.args.output_dir
 
diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py
index f5baad61ba..3c29b221eb 100644
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -49,11 +49,11 @@ if is_tf_available():
 
 def set_seed(seed: int):
     """
-    Helper function for reproducible behavior to set the seed in ``random``, ``numpy``, ``torch`` and/or ``tf`` (if
+    Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch` and/or `tf` (if
     installed).
 
     Args:
-        seed (:obj:`int`): The seed to set.
+        seed (`int`): The seed to set.
     """
     random.seed(seed)
     np.random.seed(seed)
@@ -70,8 +70,8 @@ class EvalPrediction(NamedTuple):
     Evaluation output (always contains labels), to be used to compute metrics.
 
     Parameters:
-        predictions (:obj:`np.ndarray`): Predictions of the model.
-        label_ids (:obj:`np.ndarray`): Targets to be matched.
+        predictions (`np.ndarray`): Predictions of the model.
+        label_ids (`np.ndarray`): Targets to be matched.
     """
 
     predictions: Union[np.ndarray, Tuple[np.ndarray]]
@@ -134,15 +134,15 @@ class HubStrategy(ExplicitEnum):
 
 class BestRun(NamedTuple):
     """
-    The best run found by an hyperparameter search (see :class:`~transformers.Trainer.hyperparameter_search`).
+    The best run found by a hyperparameter search (see [`~Trainer.hyperparameter_search`]).
 
     Parameters:
-        run_id (:obj:`str`):
+        run_id (`str`):
             The id of the best run (if models were saved, the corresponding checkpoint will be in the folder ending
             with run-{run_id}).
-        objective (:obj:`float`):
+        objective (`float`):
             The objective that was obtained for this run.
-        hyperparameters (:obj:`Dict[str, Any]`):
+        hyperparameters (`Dict[str, Any]`):
             The hyperparameters picked to get this run.
     """
 
@@ -154,13 +154,13 @@ class BestRun(NamedTuple):
 def default_compute_objective(metrics: Dict[str, float]) -> float:
     """
     The default objective to maximize/minimize when doing an hyperparameter search. It is the evaluation loss if no
-    metrics are provided to the :class:`~transformers.Trainer`, the sum of all metrics otherwise.
+    metrics are provided to the [`Trainer`], the sum of all metrics otherwise.
 
     Args:
-        metrics (:obj:`Dict[str, float]`): The metrics returned by the evaluate method.
+        metrics (`Dict[str, float]`): The metrics returned by the evaluate method.
 
     Return:
-        :obj:`float`: The objective to minimize or maximize
+        `float`: The objective to minimize or maximize
     """
     metrics = copy.deepcopy(metrics)
     loss = metrics.pop("eval_loss", None)
@@ -292,22 +292,23 @@ class TrainerMemoryTracker:
     """
     A helper class that tracks cpu and gpu memory.
 
-    This class will silently skip unless ``psutil`` is available. Install with ``pip install psutil``.
+    This class will silently skip unless `psutil` is available. Install with `pip install psutil`.
 
     When a stage completes, it can pass metrics dict to update with the memory metrics gathered during this stage.
 
-    Example ::
+    Example:
 
-        self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics)
-        self._memory_tracker.start()
-        code ...
-        metrics = {"train_runtime": 10.5}
-        self._memory_tracker.stop_and_update_metrics(metrics)
+    ```python
+    self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics)
+    self._memory_tracker.start()
+    code ...
+    metrics = {"train_runtime": 10.5}
+    self._memory_tracker.stop_and_update_metrics(metrics)
+    ```
 
-    At the moment GPU tracking is only for ``pytorch``, but can be extended to support ``tensorflow``.
-
-    To understand this class' intricacies please read the documentation of :meth:`~transformers.Trainer.log_metrics`.
+    At the moment GPU tracking is only for `pytorch`, but can be extended to support `tensorflow`.
 
+    To understand this class' intricacies please read the documentation of [`~Trainer.log_metrics`].
     """
 
     # map trainer methods to metrics prefix
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 71404a83ba..d0c4ed4ce3 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -75,324 +75,328 @@ class TrainingArguments:
     TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
     itself**.
 
-    Using :class:`~transformers.HfArgumentParser` we can turn this class into `argparse
-    <https://docs.python.org/3/library/argparse.html#module-argparse>`__ arguments that can be specified on the command
+    Using [`HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse.html#module-argparse) arguments that can be specified on the command
     line.
 
     Parameters:
-        output_dir (:obj:`str`):
+        output_dir (`str`):
             The output directory where the model predictions and checkpoints will be written.
-        overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            If :obj:`True`, overwrite the content of the output directory. Use this to continue training if
-            :obj:`output_dir` points to a checkpoint directory.
-        do_train (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to run training or not. This argument is not directly used by :class:`~transformers.Trainer`, it's
-            intended to be used by your training/evaluation scripts instead. See the `example scripts
-            <https://github.com/huggingface/transformers/tree/master/examples>`__ for more details.
-        do_eval (:obj:`bool`, `optional`):
-            Whether to run evaluation on the validation set or not. Will be set to :obj:`True` if
-            :obj:`evaluation_strategy` is different from :obj:`"no"`. This argument is not directly used by
-            :class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See
-            the `example scripts <https://github.com/huggingface/transformers/tree/master/examples>`__ for more
+        overwrite_output_dir (`bool`, *optional*, defaults to `False`):
+            If `True`, overwrite the content of the output directory. Use this to continue training if
+            `output_dir` points to a checkpoint directory.
+        do_train (`bool`, *optional*, defaults to `False`):
+            Whether to run training or not. This argument is not directly used by [`Trainer`], it's
+            intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/master/examples) for more details.
+        do_eval (`bool`, *optional*):
+            Whether to run evaluation on the validation set or not. Will be set to `True` if
+            `evaluation_strategy` is different from `"no"`. This argument is not directly used by
+            [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See
+            the [example scripts](https://github.com/huggingface/transformers/tree/master/examples) for more
             details.
-        do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        do_predict (`bool`, *optional*, defaults to `False`):
             Whether to run predictions on the test set or not. This argument is not directly used by
-            :class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See
-            the `example scripts <https://github.com/huggingface/transformers/tree/master/examples>`__ for more
+            [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See
+            the [example scripts](https://github.com/huggingface/transformers/tree/master/examples) for more
             details.
-        evaluation_strategy (:obj:`str` or :class:`~transformers.trainer_utils.IntervalStrategy`, `optional`, defaults to :obj:`"no"`):
+        evaluation_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`):
             The evaluation strategy to adopt during training. Possible values are:
 
-                * :obj:`"no"`: No evaluation is done during training.
-                * :obj:`"steps"`: Evaluation is done (and logged) every :obj:`eval_steps`.
-                * :obj:`"epoch"`: Evaluation is done at the end of each epoch.
+                - `"no"`: No evaluation is done during training.
+                - `"steps"`: Evaluation is done (and logged) every `eval_steps`.
+                - `"epoch"`: Evaluation is done at the end of each epoch.
 
-        prediction_loss_only (:obj:`bool`, `optional`, defaults to `False`):
+        prediction_loss_only (`bool`, *optional*, defaults to `False`):
             When performing evaluation and generating predictions, only returns the loss.
-        per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8):
+        per_device_train_batch_size (`int`, *optional*, defaults to 8):
             The batch size per GPU/TPU core/CPU for training.
-        per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):
+        per_device_eval_batch_size (`int`, *optional*, defaults to 8):
             The batch size per GPU/TPU core/CPU for evaluation.
-        gradient_accumulation_steps (:obj:`int`, `optional`, defaults to 1):
+        gradient_accumulation_steps (`int`, *optional*, defaults to 1):
             Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
 
-            .. warning::
+            <Tip warning={true}>
 
-                When using gradient accumulation, one step is counted as one step with backward pass. Therefore,
-                logging, evaluation, save will be conducted every ``gradient_accumulation_steps * xxx_step`` training
-                examples.
-        eval_accumulation_steps (:obj:`int`, `optional`):
+            When using gradient accumulation, one step is counted as one step with backward pass. Therefore,
+            logging, evaluation, save will be conducted every `gradient_accumulation_steps * xxx_step` training
+            examples.
+
+            </Tip>
+
+        eval_accumulation_steps (`int`, *optional*):
             Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If
             left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster but
             requires more memory).
-        learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
-            The initial learning rate for :class:`~transformers.AdamW` optimizer.
-        weight_decay (:obj:`float`, `optional`, defaults to 0):
+        learning_rate (`float`, *optional*, defaults to 5e-5):
+            The initial learning rate for [`AdamW`] optimizer.
+        weight_decay (`float`, *optional*, defaults to 0):
             The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in
-            :class:`~transformers.AdamW` optimizer.
-        adam_beta1 (:obj:`float`, `optional`, defaults to 0.9):
-            The beta1 hyperparameter for the :class:`~transformers.AdamW` optimizer.
-        adam_beta2 (:obj:`float`, `optional`, defaults to 0.999):
-            The beta2 hyperparameter for the :class:`~transformers.AdamW` optimizer.
-        adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
-            The epsilon hyperparameter for the :class:`~transformers.AdamW` optimizer.
-        max_grad_norm (:obj:`float`, `optional`, defaults to 1.0):
+            [`AdamW`] optimizer.
+        adam_beta1 (`float`, *optional*, defaults to 0.9):
+            The beta1 hyperparameter for the [`AdamW`] optimizer.
+        adam_beta2 (`float`, *optional*, defaults to 0.999):
+            The beta2 hyperparameter for the [`AdamW`] optimizer.
+        adam_epsilon (`float`, *optional*, defaults to 1e-8):
+            The epsilon hyperparameter for the [`AdamW`] optimizer.
+        max_grad_norm (`float`, *optional*, defaults to 1.0):
             Maximum gradient norm (for gradient clipping).
-        num_train_epochs(:obj:`float`, `optional`, defaults to 3.0):
+        num_train_epochs (`float`, *optional*, defaults to 3.0):
             Total number of training epochs to perform (if not an integer, will perform the decimal part percents of
             the last epoch before stopping training).
-        max_steps (:obj:`int`, `optional`, defaults to -1):
+        max_steps (`int`, *optional*, defaults to -1):
             If set to a positive number, the total number of training steps to perform. Overrides
-            :obj:`num_train_epochs`. In case of using a finite iterable dataset the training may stop before reaching
+            `num_train_epochs`. In case of using a finite iterable dataset the training may stop before reaching
             the set number of steps when all data is exhausted
-        lr_scheduler_type (:obj:`str` or :class:`~transformers.SchedulerType`, `optional`, defaults to :obj:`"linear"`):
-            The scheduler type to use. See the documentation of :class:`~transformers.SchedulerType` for all possible
+        lr_scheduler_type (`str` or [`SchedulerType`], *optional*, defaults to `"linear"`):
+            The scheduler type to use. See the documentation of [`SchedulerType`] for all possible
             values.
-        warmup_ratio (:obj:`float`, `optional`, defaults to 0.0):
-            Ratio of total training steps used for a linear warmup from 0 to :obj:`learning_rate`.
-        warmup_steps (:obj:`int`, `optional`, defaults to 0):
-            Number of steps used for a linear warmup from 0 to :obj:`learning_rate`. Overrides any effect of
-            :obj:`warmup_ratio`.
-        log_level (:obj:`str`, `optional`, defaults to ``passive``):
+        warmup_ratio (`float`, *optional*, defaults to 0.0):
+            Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
+        warmup_steps (`int`, *optional*, defaults to 0):
+            Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of
+            `warmup_ratio`.
+        log_level (`str`, *optional*, defaults to `passive`):
             Logger log level to use on the main process. Possible choices are the log levels as strings: 'debug',
             'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and lets the
             application set the level.
-        log_level_replica (:obj:`str`, `optional`, defaults to ``passive``):
-            Logger log level to use on replicas. Same choices as ``log_level``"
-        log_on_each_node (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            In multinode distributed training, whether to log using :obj:`log_level` once per node, or only on the main
+        log_level_replica (`str`, *optional*, defaults to `passive`):
+            Logger log level to use on replicas. Same choices as `log_level`.
+        log_on_each_node (`bool`, *optional*, defaults to `True`):
+            In multinode distributed training, whether to log using `log_level` once per node, or only on the main
             node.
-        logging_dir (:obj:`str`, `optional`):
-            `TensorBoard <https://www.tensorflow.org/tensorboard>`__ log directory. Will default to
-            `output_dir/runs/**CURRENT_DATETIME_HOSTNAME**`.
-        logging_strategy (:obj:`str` or :class:`~transformers.trainer_utils.IntervalStrategy`, `optional`, defaults to :obj:`"steps"`):
+        logging_dir (`str`, *optional*):
+            [TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to
+            *output_dir/runs/CURRENT_DATETIME_HOSTNAME*.
+        logging_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`):
             The logging strategy to adopt during training. Possible values are:
 
-                * :obj:`"no"`: No logging is done during training.
-                * :obj:`"epoch"`: Logging is done at the end of each epoch.
-                * :obj:`"steps"`: Logging is done every :obj:`logging_steps`.
+                - `"no"`: No logging is done during training.
+                - `"epoch"`: Logging is done at the end of each epoch.
+                - `"steps"`: Logging is done every `logging_steps`.
 
-        logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to log and evaluate the first :obj:`global_step` or not.
-        logging_steps (:obj:`int`, `optional`, defaults to 500):
-            Number of update steps between two logs if :obj:`logging_strategy="steps"`.
-        logging_nan_inf_filter (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether to filter :obj:`nan` and :obj:`inf` losses for logging. If set to obj:`True` the loss of every step
-            that is :obj:`nan` or :obj:`inf` is filtered and the average loss of the current logging window is taken
+        logging_first_step (`bool`, *optional*, defaults to `False`):
+            Whether to log and evaluate the first `global_step` or not.
+        logging_steps (`int`, *optional*, defaults to 500):
+            Number of update steps between two logs if `logging_strategy="steps"`.
+        logging_nan_inf_filter (`bool`, *optional*, defaults to `True`):
+            Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step
+            that is `nan` or `inf` is filtered and the average loss of the current logging window is taken
             instead.
 
-            .. note::
+            <Tip>
 
-                :obj:`logging_nan_inf_filter` only influences the logging of loss values, it does not change the
-                behavior the gradient is computed or applied to the model.
+            `logging_nan_inf_filter` only influences the logging of loss values; it does not change how the
+            gradient is computed or applied to the model.
 
-        save_strategy (:obj:`str` or :class:`~transformers.trainer_utils.IntervalStrategy`, `optional`, defaults to :obj:`"steps"`):
+            </Tip>
+
+        save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`):
             The checkpoint save strategy to adopt during training. Possible values are:
 
-                * :obj:`"no"`: No save is done during training.
-                * :obj:`"epoch"`: Save is done at the end of each epoch.
-                * :obj:`"steps"`: Save is done every :obj:`save_steps`.
-        save_steps (:obj:`int`, `optional`, defaults to 500):
-            Number of updates steps before two checkpoint saves if :obj:`save_strategy="steps"`.
-        save_total_limit (:obj:`int`, `optional`):
+                - `"no"`: No save is done during training.
+                - `"epoch"`: Save is done at the end of each epoch.
+                - `"steps"`: Save is done every `save_steps`.
+        save_steps (`int`, *optional*, defaults to 500):
+            Number of updates steps before two checkpoint saves if `save_strategy="steps"`.
+        save_total_limit (`int`, *optional*):
             If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
-            :obj:`output_dir`.
-        save_on_each_node (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            `output_dir`.
+        save_on_each_node (`bool`, *optional*, defaults to `False`):
             When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on
             the main one.
 
             This should not be activated when the different nodes use the same storage as the files will be saved with
             the same names for each node.
-        no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        no_cuda (`bool`, *optional*, defaults to `False`):
             Whether to not use CUDA even when it is available or not.
-        seed (:obj:`int`, `optional`, defaults to 42):
+        seed (`int`, *optional*, defaults to 42):
             Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the
-            :func:`~transformers.Trainer.model_init` function to instantiate the model if it has some randomly
+            [`~Trainer.model_init`] function to instantiate the model if it has some randomly
             initialized parameters.
-        bf16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        bf16 (`bool`, *optional*, defaults to `False`):
             Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher
             NVIDIA architecture. This is an experimental API and it may change.
-        fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        fp16 (`bool`, *optional*, defaults to `False`):
             Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training.
-        fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'):
-            For :obj:`fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
-            on the `Apex documentation <https://nvidia.github.io/apex/amp.html>`__.
-        fp16_backend (:obj:`str`, `optional`, defaults to :obj:`"auto"`):
-            This argument is deprecated. Use ``half_precision_backend`` instead.
-        half_precision_backend (:obj:`str`, `optional`, defaults to :obj:`"auto"`):
-            The backend to use for mixed precision training. Must be one of :obj:`"auto"`, :obj:`"amp"` or
-            :obj:`"apex"`. :obj:`"auto"` will use AMP or APEX depending on the PyTorch version detected, while the
+        fp16_opt_level (`str`, *optional*, defaults to 'O1'):
+            For `fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
+            on the [Apex documentation](https://nvidia.github.io/apex/amp.html).
+        fp16_backend (`str`, *optional*, defaults to `"auto"`):
+            This argument is deprecated. Use `half_precision_backend` instead.
+        half_precision_backend (`str`, *optional*, defaults to `"auto"`):
+            The backend to use for mixed precision training. Must be one of `"auto"`, `"amp"` or
+            `"apex"`. `"auto"` will use AMP or APEX depending on the PyTorch version detected, while the
             other choices will force the requested backend.
-        bf16_full_eval (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        bf16_full_eval (`bool`, *optional*, defaults to `False`):
             Whether to use full bfloat16 evaluation instead of 32-bit. This will be faster and save memory but can harm
             metric values. This is an experimental API and it may change.
-        fp16_full_eval (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        fp16_full_eval (`bool`, *optional*, defaults to `False`):
             Whether to use full float16 evaluation instead of 32-bit. This will be faster and save memory but can harm
             metric values.
-        tf32 (:obj:`bool`, `optional`):
+        tf32 (`bool`, *optional*):
             Whether to enable tf32 mode, available in Ampere and newer GPU architectures. This is an experimental API
             and it may change.
-        local_rank (:obj:`int`, `optional`, defaults to -1):
+        local_rank (`int`, *optional*, defaults to -1):
             Rank of the process during distributed training.
-        xpu_backend (:obj:`str`, `optional`):
-            The backend to use for xpu distributed training. Must be one of :obj:`"mpi"` or :obj:`"ccl"`.
-        tpu_num_cores (:obj:`int`, `optional`):
+        xpu_backend (`str`, *optional*):
+            The backend to use for xpu distributed training. Must be one of `"mpi"` or `"ccl"`.
+        tpu_num_cores (`int`, *optional*):
             When training on TPU, the number of TPU cores (automatically passed by launcher script).
-        dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        dataloader_drop_last (`bool`, *optional*, defaults to `False`):
             Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
             or not.
-        eval_steps (:obj:`int`, `optional`):
-            Number of update steps between two evaluations if :obj:`evaluation_strategy="steps"`. Will default to the
-            same value as :obj:`logging_steps` if not set.
-        dataloader_num_workers (:obj:`int`, `optional`, defaults to 0):
+        eval_steps (`int`, *optional*):
+            Number of update steps between two evaluations if `evaluation_strategy="steps"`. Will default to the
+            same value as `logging_steps` if not set.
+        dataloader_num_workers (`int`, *optional*, defaults to 0):
             Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the
             main process.
-        past_index (:obj:`int`, `optional`, defaults to -1):
-            Some models like :doc:`TransformerXL <../model_doc/transformerxl>` or :doc:`XLNet <../model_doc/xlnet>` can
+        past_index (`int`, *optional*, defaults to -1):
+            Some models like [TransformerXL](../model_doc/transformerxl) or [XLNet](../model_doc/xlnet) can
             make use of the past hidden states for their predictions. If this argument is set to a positive int, the
-            ``Trainer`` will use the corresponding output (usually index 2) as the past state and feed it to the model
-            at the next training step under the keyword argument ``mems``.
-        run_name (:obj:`str`, `optional`):
-            A descriptor for the run. Typically used for `wandb <https://www.wandb.com/>`_ logging.
-        disable_tqdm (:obj:`bool`, `optional`):
+            `Trainer` will use the corresponding output (usually index 2) as the past state and feed it to the model
+            at the next training step under the keyword argument `mems`.
+        run_name (`str`, *optional*):
+            A descriptor for the run. Typically used for [wandb](https://www.wandb.com/) logging.
+        disable_tqdm (`bool`, *optional*):
             Whether or not to disable the tqdm progress bars and table of metrics produced by
-            :class:`~transformers.notebook.NotebookTrainingTracker` in Jupyter Notebooks. Will default to :obj:`True`
-            if the logging level is set to warn or lower (default), :obj:`False` otherwise.
-        remove_unused_columns (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            If using :obj:`datasets.Dataset` datasets, whether or not to automatically remove the columns unused by the
+            [`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True`
+            if the logging level is set to warn or lower (default), `False` otherwise.
+        remove_unused_columns (`bool`, *optional*, defaults to `True`):
+            If using `datasets.Dataset` datasets, whether or not to automatically remove the columns unused by the
             model forward method.
 
-            (Note that this behavior is not implemented for :class:`~transformers.TFTrainer` yet.)
-        label_names (:obj:`List[str]`, `optional`):
+            (Note that this behavior is not implemented for [`TFTrainer`] yet.)
+        label_names (`List[str]`, *optional*):
             The list of keys in your dictionary of inputs that correspond to the labels.
 
-            Will eventually default to :obj:`["labels"]` except if the model used is one of the
-            :obj:`XxxForQuestionAnswering` in which case it will default to :obj:`["start_positions",
-            "end_positions"]`.
-        load_best_model_at_end (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Will eventually default to `["labels"]` except if the model used is one of the
+            `XxxForQuestionAnswering` in which case it will default to `["start_positions", "end_positions"]`.
+        load_best_model_at_end (`bool`, *optional*, defaults to `False`):
             Whether or not to load the best model found during training at the end of training.
 
-            .. note::
+            <Tip>
 
-                When set to :obj:`True`, the parameters :obj:`save_strategy` needs to be the same as
-                :obj:`eval_strategy`, and in the case it is "steps", :obj:`save_steps` must be a round multiple of
-                :obj:`eval_steps`.
-        metric_for_best_model (:obj:`str`, `optional`):
-            Use in conjunction with :obj:`load_best_model_at_end` to specify the metric to use to compare two different
-            models. Must be the name of a metric returned by the evaluation with or without the prefix :obj:`"eval_"`.
-            Will default to :obj:`"loss"` if unspecified and :obj:`load_best_model_at_end=True` (to use the evaluation
+            When set to `True`, the parameter `save_strategy` needs to be the same as
+            `eval_strategy`, and in the case it is "steps", `save_steps` must be a round multiple of
+            `eval_steps`.
+
+            </Tip>
+
+        metric_for_best_model (`str`, *optional*):
+            Use in conjunction with `load_best_model_at_end` to specify the metric to use to compare two different
+            models. Must be the name of a metric returned by the evaluation with or without the prefix `"eval_"`.
+            Will default to `"loss"` if unspecified and `load_best_model_at_end=True` (to use the evaluation
             loss).
 
-            If you set this value, :obj:`greater_is_better` will default to :obj:`True`. Don't forget to set it to
-            :obj:`False` if your metric is better when lower.
-        greater_is_better (:obj:`bool`, `optional`):
-            Use in conjunction with :obj:`load_best_model_at_end` and :obj:`metric_for_best_model` to specify if better
+            If you set this value, `greater_is_better` will default to `True`. Don't forget to set it to
+            `False` if your metric is better when lower.
+        greater_is_better (`bool`, *optional*):
+            Use in conjunction with `load_best_model_at_end` and `metric_for_best_model` to specify if better
             models should have a greater metric or not. Will default to:
 
-            - :obj:`True` if :obj:`metric_for_best_model` is set to a value that isn't :obj:`"loss"` or
-              :obj:`"eval_loss"`.
-            - :obj:`False` if :obj:`metric_for_best_model` is not set, or set to :obj:`"loss"` or :obj:`"eval_loss"`.
-        ignore_data_skip (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            - `True` if `metric_for_best_model` is set to a value that isn't `"loss"` or
+              `"eval_loss"`.
+            - `False` if `metric_for_best_model` is not set, or set to `"loss"` or `"eval_loss"`.
+        ignore_data_skip (`bool`, *optional*, defaults to `False`):
             When resuming training, whether or not to skip the epochs and batches to get the data loading at the same
-            stage as in the previous training. If set to :obj:`True`, the training will begin faster (as that skipping
+            stage as in the previous training. If set to `True`, the training will begin faster (as that skipping
             step can take a long time) but will not yield the same results as the interrupted training would have.
-        sharded_ddp (:obj:`bool`, :obj:`str` or list of :class:`~transformers.trainer_utils.ShardedDDPOption`, `optional`, defaults to :obj:`False`):
-            Use Sharded DDP training from `FairScale <https://github.com/facebookresearch/fairscale>`__ (in distributed
+        sharded_ddp (`bool`, `str` or list of [`~trainer_utils.ShardedDDPOption`], *optional*, defaults to `False`):
+            Use Sharded DDP training from [FairScale](https://github.com/facebookresearch/fairscale) (in distributed
             training only). This is an experimental feature.
 
             A list of options along the following:
 
-            - :obj:`"simple"`: to use first instance of sharded DDP released by fairscale (:obj:`ShardedDDP`) similar
+            - `"simple"`: to use first instance of sharded DDP released by fairscale (`ShardedDDP`) similar
               to ZeRO-2.
-            - :obj:`"zero_dp_2"`: to use the second instance of sharded DPP released by fairscale
-              (:obj:`FullyShardedDDP`) in Zero-2 mode (with :obj:`reshard_after_forward=False`).
-            - :obj:`"zero_dp_3"`: to use the second instance of sharded DPP released by fairscale
-              (:obj:`FullyShardedDDP`) in Zero-3 mode (with :obj:`reshard_after_forward=True`).
-            - :obj:`"offload"`: to add ZeRO-offload (only compatible with :obj:`"zero_dp_2"` and :obj:`"zero_dp_3"`).
+            - `"zero_dp_2"`: to use the second instance of sharded DDP released by fairscale
+              (`FullyShardedDDP`) in Zero-2 mode (with `reshard_after_forward=False`).
+            - `"zero_dp_3"`: to use the second instance of sharded DDP released by fairscale
+              (`FullyShardedDDP`) in Zero-3 mode (with `reshard_after_forward=True`).
+            - `"offload"`: to add ZeRO-offload (only compatible with `"zero_dp_2"` and `"zero_dp_3"`).
 
             If a string is passed, it will be split on space. If a bool is passed, it will be converted to an empty
-            list for :obj:`False` and :obj:`["simple"]` for :obj:`True`.
-        deepspeed (:obj:`str` or :obj:`dict`, `optional`):
-            Use `Deepspeed <https://github.com/microsoft/deepspeed>`__. This is an experimental feature and its API may
+            list for `False` and `["simple"]` for `True`.
+        deepspeed (`str` or `dict`, *optional*):
+            Use [Deepspeed](https://github.com/microsoft/deepspeed). This is an experimental feature and its API may
             evolve in the future. The value is either the location of DeepSpeed json config file (e.g.,
-            ``ds_config.json``) or an already loaded json file as a :obj:`dict`"
-        label_smoothing_factor (:obj:`float`, `optional`, defaults to 0.0):
+            `ds_config.json`) or an already loaded json file as a `dict`.
+        label_smoothing_factor (`float`, *optional*, defaults to 0.0):
             The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded
-            labels are changed from 0s and 1s to :obj:`label_smoothing_factor/num_labels` and :obj:`1 -
-            label_smoothing_factor + label_smoothing_factor/num_labels` respectively.
-        debug (:obj:`str` or list of :class:`~transformers.debug_utils.DebugOption`, `optional`, defaults to :obj:`""`):
+            labels are changed from 0s and 1s to `label_smoothing_factor/num_labels` and `1 - label_smoothing_factor + label_smoothing_factor/num_labels` respectively.
+        debug (`str` or list of [`~debug_utils.DebugOption`], *optional*, defaults to `""`):
             Enable one or more debug features. This is an experimental feature.
 
             Possible options are:
 
-            - :obj:`"underflow_overflow"`: detects overflow in model's input/outputs and reports the last frames that
+            - `"underflow_overflow"`: detects overflow in model's input/outputs and reports the last frames that
               led to the event
-            - :obj:`"tpu_metrics_debug"`: print debug metrics on TPU
+            - `"tpu_metrics_debug"`: print debug metrics on TPU
 
             The options should be separated by whitespaces.
-        adafactor (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to use the :class:`~transformers.Adafactor` optimizer instead of
-            :class:`~transformers.AdamW`.
-        group_by_length (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        adafactor (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the [`Adafactor`] optimizer instead of
+            [`AdamW`].
+        group_by_length (`bool`, *optional*, defaults to `False`):
             Whether or not to group together samples of roughly the same length in the training dataset (to minimize
             padding applied and be more efficient). Only useful if applying dynamic padding.
-        length_column_name (:obj:`str`, `optional`, defaults to :obj:`"length"`):
+        length_column_name (`str`, *optional*, defaults to `"length"`):
             Column name for precomputed lengths. If the column exists, grouping by length will use these values rather
-            than computing them on train startup. Ignored unless :obj:`group_by_length` is :obj:`True` and the dataset
-            is an instance of :obj:`Dataset`.
-        report_to (:obj:`str` or :obj:`List[str]`, `optional`, defaults to :obj:`"all"`):
-            The list of integrations to report the results and logs to. Supported platforms are :obj:`"azure_ml"`,
-            :obj:`"comet_ml"`, :obj:`"mlflow"`, :obj:`"tensorboard"` and :obj:`"wandb"`. Use :obj:`"all"` to report to
-            all integrations installed, :obj:`"none"` for no integrations.
-        ddp_find_unused_parameters (:obj:`bool`, `optional`):
-            When using distributed training, the value of the flag :obj:`find_unused_parameters` passed to
-            :obj:`DistributedDataParallel`. Will default to :obj:`False` if gradient checkpointing is used, :obj:`True`
+            than computing them on train startup. Ignored unless `group_by_length` is `True` and the dataset
+            is an instance of `Dataset`.
+        report_to (`str` or `List[str]`, *optional*, defaults to `"all"`):
+            The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,
+            `"comet_ml"`, `"mlflow"`, `"tensorboard"` and `"wandb"`. Use `"all"` to report to
+            all integrations installed, `"none"` for no integrations.
+        ddp_find_unused_parameters (`bool`, *optional*):
+            When using distributed training, the value of the flag `find_unused_parameters` passed to
+            `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True`
             otherwise.
-        ddp_bucket_cap_mb (:obj:`int`, `optional`):
-            When using distributed training, the value of the flag :obj:`bucket_cap_mb` passed to
-            :obj:`DistributedDataParallel`.
-        dataloader_pin_memory (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether you want to pin memory in data loaders or not. Will default to :obj:`True`.
-        skip_memory_metrics (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        ddp_bucket_cap_mb (`int`, *optional*):
+            When using distributed training, the value of the flag `bucket_cap_mb` passed to
+            `DistributedDataParallel`.
+        dataloader_pin_memory (`bool`, *optional*, defaults to `True`):
+            Whether you want to pin memory in data loaders or not. Will default to `True`.
+        skip_memory_metrics (`bool`, *optional*, defaults to `True`):
             Whether to skip adding of memory profiler reports to metrics. This is skipped by default because it slows
             down the training and evaluation speed.
-        push_to_hub (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        push_to_hub (`bool`, *optional*, defaults to `False`):
             Whether or not to upload the trained model to the hub after training. If this is activated, and
-            :obj:`output_dir` exists, it needs to be a local clone of the repository to which the
-            :class:`~transformers.Trainer` will be pushed.
-        resume_from_checkpoint (:obj:`str`, `optional`):
+            `output_dir` exists, it needs to be a local clone of the repository to which the
+            [`Trainer`] will be pushed.
+        resume_from_checkpoint (`str`, *optional*):
             The path to a folder with a valid checkpoint for your model. This argument is not directly used by
-            :class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See
-            the `example scripts <https://github.com/huggingface/transformers/tree/master/examples>`__ for more
+            [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See
+            the [example scripts](https://github.com/huggingface/transformers/tree/master/examples) for more
             details.
-        hub_model_id (:obj:`str`, `optional`):
-            The name of the repository to keep in sync with the local `output_dir`. It can be a simple model ID in
+        hub_model_id (`str`, *optional*):
+            The name of the repository to keep in sync with the local *output_dir*. It can be a simple model ID in
             which case the model will be pushed in your namespace. Otherwise it should be the whole repository name,
-            for instance :obj:`"user_name/model"`, which allows you to push to an organization you are a member of with
-            :obj:`"organization_name/model"`. Will default to :obj:`user_name/output_dir_name` with `output_dir_name`
-            being the name of :obj:`output_dir`.
+            for instance `"user_name/model"`, which allows you to push to an organization you are a member of with
+            `"organization_name/model"`. Will default to `user_name/output_dir_name` with *output_dir_name*
+            being the name of `output_dir`.
 
-            Will default to to the name of :obj:`output_dir`.
-        hub_strategy (:obj:`str` or :class:`~transformers.trainer_utils.HubStrategy`, `optional`, defaults to :obj:`"every_save"`):
+            Will default to the name of `output_dir`.
+        hub_strategy (`str` or [`~trainer_utils.HubStrategy`], *optional*, defaults to `"every_save"`):
             Defines the scope of what is pushed to the Hub and when. Possible values are:
 
-            - :obj:`"end"`: push the model, its configuration, the tokenizer (if passed along to the
-              :class:`~transformers.Trainer`) and a draft of a model card at the end of training.
-            - :obj:`"every_save"`: push the model, its configuration, the tokenizer (if passed along to the
-              :class:`~transformers.Trainer`) and a draft of a model card each time there is a model save. The pushes
+            - `"end"`: push the model, its configuration, the tokenizer (if passed along to the
+              [`Trainer`]) and a draft of a model card at the end of training.
+            - `"every_save"`: push the model, its configuration, the tokenizer (if passed along to the
+              [`Trainer`]) and a draft of a model card each time there is a model save. The pushes
               are asynchronous to not block training, and in case the save are very frequent, a new push is only
               attempted if the previous one is finished. A last push is made with the final model at the end of
               training.
-            - :obj:`"checkpoint"`: like :obj:`"every_save"` but the latest checkpoint is also pushed in a subfolder
+            - `"checkpoint"`: like `"every_save"` but the latest checkpoint is also pushed in a subfolder
               named last-checkpoint, allowing you to resume training easily with
-              :obj:`trainer.train(resume_from_checkpoint="last-checkpoint")`.
-            - :obj:`"all_checkpoints"`: like :obj:`"checkpoint"` but all checkpoints are pushed like they appear in the
+              `trainer.train(resume_from_checkpoint="last-checkpoint")`.
+            - `"all_checkpoints"`: like `"checkpoint"` but all checkpoints are pushed like they appear in the
               output folder (so you will get one checkpoint folder per folder in your final repository)
 
-        hub_token (:obj:`str`, `optional`):
+        hub_token (`str`, *optional*):
             The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with
-            :obj:`huggingface-cli login`.
-        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            `huggingface-cli login`.
+        gradient_checkpointing (`bool`, *optional*, defaults to `False`):
             If True, use gradient checkpointing to save memory at the expense of slower backward pass.
     """
 
@@ -948,7 +952,7 @@ class TrainingArguments:
     @property
     def train_batch_size(self) -> int:
         """
-        The actual batch size for training (may differ from :obj:`per_gpu_train_batch_size` in distributed training).
+        The actual batch size for training (may differ from `per_gpu_train_batch_size` in distributed training).
         """
         if self.per_gpu_train_batch_size:
             logger.warning(
@@ -962,7 +966,7 @@ class TrainingArguments:
     @property
     def eval_batch_size(self) -> int:
         """
-        The actual batch size for evaluation (may differ from :obj:`per_gpu_eval_batch_size` in distributed training).
+        The actual batch size for evaluation (may differ from `per_gpu_eval_batch_size` in distributed training).
         """
         if self.per_gpu_eval_batch_size:
             logger.warning(
@@ -1068,11 +1072,11 @@ class TrainingArguments:
         """
         The current mode used for parallelism if multiple GPUs/TPU cores are available. One of:
 
-        - :obj:`ParallelMode.NOT_PARALLEL`: no parallelism (CPU or one GPU).
-        - :obj:`ParallelMode.NOT_DISTRIBUTED`: several GPUs in one single process (uses :obj:`torch.nn.DataParallel`).
-        - :obj:`ParallelMode.DISTRIBUTED`: several GPUs, each having its own process (uses
-          :obj:`torch.nn.DistributedDataParallel`).
-        - :obj:`ParallelMode.TPU`: several TPU cores.
+        - `ParallelMode.NOT_PARALLEL`: no parallelism (CPU or one GPU).
+        - `ParallelMode.NOT_DISTRIBUTED`: several GPUs in one single process (uses `torch.nn.DataParallel`).
+        - `ParallelMode.DISTRIBUTED`: several GPUs, each having its own process (uses
+          `torch.nn.DistributedDataParallel`).
+        - `ParallelMode.TPU`: several TPU cores.
         """
         if is_torch_tpu_available():
             return ParallelMode.TPU
@@ -1166,13 +1170,13 @@ class TrainingArguments:
         Returns the log level to be used depending on whether this process is the main process of node 0, main process
         of node non-0, or a non-main process.
 
-        For the main process the log level defaults to ``logging.INFO`` unless overridden by ``log_level`` argument.
+        For the main process the log level defaults to `logging.INFO` unless overridden by `log_level` argument.
 
-        For the replica processes the log level defaults to ``logging.WARNING`` unless overridden by
-        ``log_level_replica`` argument.
+        For the replica processes the log level defaults to `logging.WARNING` unless overridden by
+        `log_level_replica` argument.
 
         The choice between the main and replica process settings is made according to the return value of
-        ``should_log``.
+        `should_log`.
         """
 
         log_level_main_node = logging.INFO if self.log_level == -1 else self.log_level
@@ -1196,21 +1200,21 @@ class TrainingArguments:
     @contextlib.contextmanager
     def main_process_first(self, local=True, desc="work"):
         """
-            A context manager for torch distributed environment where on needs to do something on the main process,
-            while blocking replicas, and when it's finished releasing the replicas.
+        A context manager for torch distributed environment where one needs to do something on the main process,
+        while blocking replicas, and when it's finished releasing the replicas.
 
-            One such use is for ``datasets``'s ``map`` feature which to be efficient should be run once on the main
-            process, which upon completion saves a cached version of results and which then automatically gets loaded
-            by the replicas.
+        One such use is for `datasets`'s `map` feature which to be efficient should be run once on the main
+        process, which upon completion saves a cached version of results and which then automatically gets loaded
+        by the replicas.
 
         Args:
-            local (:obj:`bool`, `optional`, defaults to :obj:`True`):
-                if :obj:`True` first means process of rank 0 of each node if :obj:`False` first means process of rank 0
+            local (`bool`, *optional*, defaults to `True`):
+                if `True`, first means process of rank 0 of each node; if `False`, first means process of rank 0
                 of node rank 0 In multi-node environment with a shared filesystem you most likely will want to use
-                ``local=False`` so that only the main process of the first node will do the processing. If however, the
+                `local=False` so that only the main process of the first node will do the processing. If however, the
                 filesystem is not shared, then the main process of each node will need to do the processing, which is
                 the default behavior.
-            desc (:obj:`str`, `optional`, defaults to ``"work"``):
+            desc (`str`, *optional*, defaults to `"work"`):
                 a work description to be used in debug logs
 
         """
diff --git a/src/transformers/training_args_seq2seq.py b/src/transformers/training_args_seq2seq.py
index 02b9a77be0..4f2154d86e 100644
--- a/src/transformers/training_args_seq2seq.py
+++ b/src/transformers/training_args_seq2seq.py
@@ -27,20 +27,20 @@ logger = logging.getLogger(__name__)
 @add_start_docstrings(TrainingArguments.__doc__)
 class Seq2SeqTrainingArguments(TrainingArguments):
     """
-    sortish_sampler (:obj:`bool`, `optional`, defaults to :obj:`False`):
-        Whether to use a `sortish sampler` or not. Only possible if the underlying datasets are `Seq2SeqDataset` for
+    sortish_sampler (`bool`, *optional*, defaults to `False`):
+        Whether to use a *sortish sampler* or not. Only possible if the underlying datasets are *Seq2SeqDataset* for
         now but will become generally available in the near future.
 
         It sorts the inputs according to lengths in order to minimize the padding size, with a bit of randomness for
         the training set.
-    predict_with_generate (:obj:`bool`, `optional`, defaults to :obj:`False`):
+    predict_with_generate (`bool`, *optional*, defaults to `False`):
         Whether to use generate to calculate generative metrics (ROUGE, BLEU).
-    generation_max_length (:obj:`int`, `optional`):
-        The :obj:`max_length` to use on each evaluation loop when :obj:`predict_with_generate=True`. Will default to
-        the :obj:`max_length` value of the model configuration.
-    generation_num_beams (:obj:`int`, `optional`):
-        The :obj:`num_beams` to use on each evaluation loop when :obj:`predict_with_generate=True`. Will default to the
-        :obj:`num_beams` value of the model configuration.
+    generation_max_length (`int`, *optional*):
+        The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default to
+        the `max_length` value of the model configuration.
+    generation_num_beams (`int`, *optional*):
+        The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default to the
+        `num_beams` value of the model configuration.
     """
 
     sortish_sampler: bool = field(default=False, metadata={"help": "Whether to use SortishSampler or not."})
diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py
index 9d8f95cb2e..64776525f1 100644
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -33,134 +33,135 @@ class TFTrainingArguments(TrainingArguments):
     TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
     itself**.
 
-    Using :class:`~transformers.HfArgumentParser` we can turn this class into `argparse
-    <https://docs.python.org/3/library/argparse.html#module-argparse>`__ arguments that can be specified on the command
+    Using [`HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse.html#module-argparse) arguments that can be specified on the command
     line.
 
     Parameters:
-        output_dir (:obj:`str`):
+        output_dir (`str`):
             The output directory where the model predictions and checkpoints will be written.
-        overwrite_output_dir (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            If :obj:`True`, overwrite the content of the output directory. Use this to continue training if
-            :obj:`output_dir` points to a checkpoint directory.
-        do_train (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to run training or not. This argument is not directly used by :class:`~transformers.Trainer`, it's
-            intended to be used by your training/evaluation scripts instead. See the `example scripts
-            <https://github.com/huggingface/transformers/tree/master/examples>`__ for more details.
-        do_eval (:obj:`bool`, `optional`):
-            Whether to run evaluation on the validation set or not. Will be set to :obj:`True` if
-            :obj:`evaluation_strategy` is different from :obj:`"no"`. This argument is not directly used by
-            :class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See
-            the `example scripts <https://github.com/huggingface/transformers/tree/master/examples>`__ for more
+        overwrite_output_dir (`bool`, *optional*, defaults to `False`):
+            If `True`, overwrite the content of the output directory. Use this to continue training if
+            `output_dir` points to a checkpoint directory.
+        do_train (`bool`, *optional*, defaults to `False`):
+            Whether to run training or not. This argument is not directly used by [`Trainer`], it's
+            intended to be used by your training/evaluation scripts instead. See the [example scripts](https://github.com/huggingface/transformers/tree/master/examples) for more details.
+        do_eval (`bool`, *optional*):
+            Whether to run evaluation on the validation set or not. Will be set to `True` if
+            `evaluation_strategy` is different from `"no"`. This argument is not directly used by
+            [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See
+            the [example scripts](https://github.com/huggingface/transformers/tree/master/examples) for more
             details.
-        do_predict (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        do_predict (`bool`, *optional*, defaults to `False`):
             Whether to run predictions on the test set or not. This argument is not directly used by
-            :class:`~transformers.Trainer`, it's intended to be used by your training/evaluation scripts instead. See
-            the `example scripts <https://github.com/huggingface/transformers/tree/master/examples>`__ for more
+            [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See
+            the [example scripts](https://github.com/huggingface/transformers/tree/master/examples) for more
             details.
-        evaluation_strategy (:obj:`str` or :class:`~transformers.trainer_utils.IntervalStrategy`, `optional`, defaults to :obj:`"no"`):
+        evaluation_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`):
             The evaluation strategy to adopt during training. Possible values are:
 
-                * :obj:`"no"`: No evaluation is done during training.
-                * :obj:`"steps"`: Evaluation is done (and logged) every :obj:`eval_steps`.
-                * :obj:`"epoch"`: Evaluation is done at the end of each epoch.
+                - `"no"`: No evaluation is done during training.
+                - `"steps"`: Evaluation is done (and logged) every `eval_steps`.
+                - `"epoch"`: Evaluation is done at the end of each epoch.
 
-        per_device_train_batch_size (:obj:`int`, `optional`, defaults to 8):
+        per_device_train_batch_size (`int`, *optional*, defaults to 8):
             The batch size per GPU/TPU core/CPU for training.
-        per_device_eval_batch_size (:obj:`int`, `optional`, defaults to 8):
+        per_device_eval_batch_size (`int`, *optional*, defaults to 8):
             The batch size per GPU/TPU core/CPU for evaluation.
-        gradient_accumulation_steps: (:obj:`int`, `optional`, defaults to 1):
+        gradient_accumulation_steps (`int`, *optional*, defaults to 1):
             Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
 
-            .. warning::
+            <Tip warning={true}>
 
-                When using gradient accumulation, one step is counted as one step with backward pass. Therefore,
-                logging, evaluation, save will be conducted every ``gradient_accumulation_steps * xxx_step`` training
-                examples.
-        learning_rate (:obj:`float`, `optional`, defaults to 5e-5):
+            When using gradient accumulation, one step is counted as one step with backward pass. Therefore,
+            logging, evaluation, save will be conducted every `gradient_accumulation_steps * xxx_step` training
+            examples.
+
+            </Tip>
+
+        learning_rate (`float`, *optional*, defaults to 5e-5):
             The initial learning rate for Adam.
-        weight_decay (:obj:`float`, `optional`, defaults to 0):
+        weight_decay (`float`, *optional*, defaults to 0):
             The weight decay to apply (if not zero).
-        adam_beta1 (:obj:`float`, `optional`, defaults to 0.9):
+        adam_beta1 (`float`, *optional*, defaults to 0.9):
             The beta1 hyperparameter for the Adam optimizer.
-        adam_beta2 (:obj:`float`, `optional`, defaults to 0.999):
+        adam_beta2 (`float`, *optional*, defaults to 0.999):
             The beta2 hyperparameter for the Adam optimizer.
-        adam_epsilon (:obj:`float`, `optional`, defaults to 1e-8):
+        adam_epsilon (`float`, *optional*, defaults to 1e-8):
             The epsilon hyperparameter for the Adam optimizer.
-        max_grad_norm (:obj:`float`, `optional`, defaults to 1.0):
+        max_grad_norm (`float`, *optional*, defaults to 1.0):
             Maximum gradient norm (for gradient clipping).
-        num_train_epochs(:obj:`float`, `optional`, defaults to 3.0):
+        num_train_epochs (`float`, *optional*, defaults to 3.0):
             Total number of training epochs to perform.
-        max_steps (:obj:`int`, `optional`, defaults to -1):
+        max_steps (`int`, *optional*, defaults to -1):
             If set to a positive number, the total number of training steps to perform. Overrides
-            :obj:`num_train_epochs`.
-        warmup_ratio (:obj:`float`, `optional`, defaults to 0.0):
-            Ratio of total training steps used for a linear warmup from 0 to :obj:`learning_rate`.
-        warmup_steps (:obj:`int`, `optional`, defaults to 0):
-            Number of steps used for a linear warmup from 0 to :obj:`learning_rate`. Overrides any effect of
-            :obj:`warmup_ratio`.
-        logging_dir (:obj:`str`, `optional`):
-            `TensorBoard <https://www.tensorflow.org/tensorboard>`__ log directory. Will default to
-            `runs/**CURRENT_DATETIME_HOSTNAME**`.
-        logging_strategy (:obj:`str` or :class:`~transformers.trainer_utils.IntervalStrategy`, `optional`, defaults to :obj:`"steps"`):
+            `num_train_epochs`.
+        warmup_ratio (`float`, *optional*, defaults to 0.0):
+            Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
+        warmup_steps (`int`, *optional*, defaults to 0):
+            Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of
+            `warmup_ratio`.
+        logging_dir (`str`, *optional*):
+            [TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to
+            *runs/**CURRENT_DATETIME_HOSTNAME***.
+        logging_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`):
             The logging strategy to adopt during training. Possible values are:
 
-                * :obj:`"no"`: No logging is done during training.
-                * :obj:`"epoch"`: Logging is done at the end of each epoch.
-                * :obj:`"steps"`: Logging is done every :obj:`logging_steps`.
+                - `"no"`: No logging is done during training.
+                - `"epoch"`: Logging is done at the end of each epoch.
+                - `"steps"`: Logging is done every `logging_steps`.
 
-        logging_first_step (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether to log and evaluate the first :obj:`global_step` or not.
-        logging_steps (:obj:`int`, `optional`, defaults to 500):
-            Number of update steps between two logs if :obj:`logging_strategy="steps"`.
-        save_strategy (:obj:`str` or :class:`~transformers.trainer_utils.IntervalStrategy`, `optional`, defaults to :obj:`"steps"`):
+        logging_first_step (`bool`, *optional*, defaults to `False`):
+            Whether to log and evaluate the first `global_step` or not.
+        logging_steps (`int`, *optional*, defaults to 500):
+            Number of update steps between two logs if `logging_strategy="steps"`.
+        save_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`):
             The checkpoint save strategy to adopt during training. Possible values are:
 
-                * :obj:`"no"`: No save is done during training.
-                * :obj:`"epoch"`: Save is done at the end of each epoch.
-                * :obj:`"steps"`: Save is done every :obj:`save_steps`.
+                - `"no"`: No save is done during training.
+                - `"epoch"`: Save is done at the end of each epoch.
+                - `"steps"`: Save is done every `save_steps`.
 
-        save_steps (:obj:`int`, `optional`, defaults to 500):
-            Number of updates steps before two checkpoint saves if :obj:`save_strategy="steps"`.
-        save_total_limit (:obj:`int`, `optional`):
+        save_steps (`int`, *optional*, defaults to 500):
+            Number of update steps between two checkpoint saves if `save_strategy="steps"`.
+        save_total_limit (`int`, *optional*):
             If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in
-            :obj:`output_dir`.
-        no_cuda (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            `output_dir`.
+        no_cuda (`bool`, *optional*, defaults to `False`):
             Whether to not use CUDA even when it is available or not.
-        seed (:obj:`int`, `optional`, defaults to 42):
+        seed (`int`, *optional*, defaults to 42):
             Random seed that will be set at the beginning of training.
-        fp16 (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        fp16 (`bool`, *optional*, defaults to `False`):
             Whether to use 16-bit (mixed) precision training (through NVIDIA Apex) instead of 32-bit training.
-        fp16_opt_level (:obj:`str`, `optional`, defaults to 'O1'):
-            For :obj:`fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
-            on the `Apex documentation <https://nvidia.github.io/apex/amp.html>`__.
-        local_rank (:obj:`int`, `optional`, defaults to -1):
+        fp16_opt_level (`str`, *optional*, defaults to 'O1'):
+            For `fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details
+            on the [Apex documentation](https://nvidia.github.io/apex/amp.html).
+        local_rank (`int`, *optional*, defaults to -1):
             During distributed training, the rank of the process.
-        tpu_num_cores (:obj:`int`, `optional`):
+        tpu_num_cores (`int`, *optional*):
             When training on TPU, the number of TPU cores (automatically passed by launcher script).
-        debug (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        debug (`bool`, *optional*, defaults to `False`):
             Whether to activate the trace to record computation graphs and profiling information or not.
-        dataloader_drop_last (:obj:`bool`, `optional`, defaults to :obj:`False`):
+        dataloader_drop_last (`bool`, *optional*, defaults to `False`):
             Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size)
             or not.
-        eval_steps (:obj:`int`, `optional`, defaults to 1000):
+        eval_steps (`int`, *optional*, defaults to 1000):
             Number of update steps before two evaluations.
-        past_index (:obj:`int`, `optional`, defaults to -1):
-            Some models like :doc:`TransformerXL <../model_doc/transformerxl>` or :doc`XLNet <../model_doc/xlnet>` can
+        past_index (`int`, *optional*, defaults to -1):
+            Some models like [TransformerXL](../model_doc/transformerxl) or [XLNet](../model_doc/xlnet) can
             make use of the past hidden states for their predictions. If this argument is set to a positive int, the
-            ``Trainer`` will use the corresponding output (usually index 2) as the past state and feed it to the model
-            at the next training step under the keyword argument ``mems``.
-        tpu_name (:obj:`str`, `optional`):
+            `Trainer` will use the corresponding output (usually index 2) as the past state and feed it to the model
+            at the next training step under the keyword argument `mems`.
+        tpu_name (`str`, *optional*):
             The name of the TPU the process is running on.
-        tpu_zone (:obj:`str`, `optional`):
+        tpu_zone (`str`, *optional*):
             The zone of the TPU the process is running on. If not specified, we will attempt to automatically detect
             from metadata.
-        gcp_project (:obj:`str`, `optional`):
+        gcp_project (`str`, *optional*):
             Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to
             automatically detect from metadata.
-        run_name (:obj:`str`, `optional`):
+        run_name (`str`, *optional*):
             A descriptor for the run. Notably used for wandb logging.
-        xla (:obj:`bool`, `optional`):
+        xla (`bool`, *optional*):
             Whether to activate the XLA compilation or not.
     """
 
@@ -259,7 +260,7 @@ class TFTrainingArguments(TrainingArguments):
     @property
     def train_batch_size(self) -> int:
         """
-        The actual batch size for training (may differ from :obj:`per_gpu_train_batch_size` in distributed training).
+        The actual batch size for training (may differ from `per_gpu_train_batch_size` in distributed training).
         """
         if self.per_gpu_train_batch_size:
             logger.warning(
@@ -272,7 +273,7 @@ class TFTrainingArguments(TrainingArguments):
     @property
     def eval_batch_size(self) -> int:
         """
-        The actual batch size for evaluation (may differ from :obj:`per_gpu_eval_batch_size` in distributed training).
+        The actual batch size for evaluation (may differ from `per_gpu_eval_batch_size` in distributed training).
         """
         if self.per_gpu_eval_batch_size:
             logger.warning(
diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py
index 38f2caa904..890a9b5a05 100644
--- a/src/transformers/utils/fx.py
+++ b/src/transformers/utils/fx.py
@@ -404,12 +404,12 @@ class HFTracer(Tracer):
 
     def path_of_module(self, mod: nn.Module) -> str:
         """
-        Helper method to find the qualified name of ``mod`` in the Module hierarchy of ``root``. For example, if
-        ``root`` has a submodule named ``foo``, which has a submodule named ``bar``, passing ``bar`` into this function
+        Helper method to find the qualified name of `mod` in the Module hierarchy of `root`. For example, if
+        `root` has a submodule named `foo`, which has a submodule named `bar`, passing `bar` into this function
         will return the string "foo.bar".
 
         Args:
-            mod (str): The ``Module`` to retrieve the qualified name for.
+            mod (`nn.Module`): The `Module` to retrieve the qualified name for.
         """
         # Prefer the O(1) algorithm
         if hasattr(self, "submodule_paths") and self.submodule_paths:
@@ -506,32 +506,32 @@ def symbolic_trace(
     Performs symbolic tracing on the model.
 
     Args:
-        model (:obj:`PretrainedModel`):
+        model ([`PretrainedModel`]):
             The model to trace.
-        input_names (:obj:`List[str]`, `optional`):
+        input_names (`List[str]`, *optional*):
             The names of the inputs of the traced model. If unset, model.dummy_inputs().keys() are used instead.
-        batch_size (:obj:`int`, `optional`, defaults to 1):
+        batch_size (`int`, *optional*, defaults to 1):
             The batch size of the traced model inputs.
-        sequence_length (:obj:`int` or :obj:`List[int]]`):
+        sequence_length (`int` or `List[int]`):
             The sequence length of the traced model inputs. For sequence-to-sequence models with different sequence
-            lengths between the encoder and the decoder inputs, this must be :obj:`[encoder_sequence_length,
-            decoder_sequence_length]`.
-        num_choices (:obj:`int`, `optional`, defaults to -1):
+            lengths between the encoder and the decoder inputs, this must be `[encoder_sequence_length, decoder_sequence_length]`.
+        num_choices (`int`, *optional*, defaults to -1):
             The number of possible choices for a multiple choice task.
 
     Returns:
-        :obj:`torch.fx.GraphModule`: A GraphModule constructed by recording operations seen while tracing the model.
+        `torch.fx.GraphModule`: A GraphModule constructed by recording operations seen while tracing the model.
 
-    Example::
+    Example:
 
-        from transformers.utils.fx import symbolic_trace
-        traced_model = symbolic_trace(
-            model,
-            input_names=["input_ids", "attention_mask", "token_type_ids"],
-            batch_size=1,
-            sequence_length=128,
-        )
-    """
+    ```python
+    from transformers.utils.fx import symbolic_trace
+    traced_model = symbolic_trace(
+        model,
+        input_names=["input_ids", "attention_mask", "token_type_ids"],
+        batch_size=1,
+        sequence_length=128,
+    )
+    ```"""
     if input_names is None:
         input_names = model.dummy_inputs.keys()
 
diff --git a/src/transformers/utils/logging.py b/src/transformers/utils/logging.py
index eab03c19a5..638122af32 100644
--- a/src/transformers/utils/logging.py
+++ b/src/transformers/utils/logging.py
@@ -46,7 +46,7 @@ _default_log_level = logging.WARNING
 def _get_default_logging_level():
     """
     If TRANSFORMERS_VERBOSITY env var is set to one of the valid choices return that as the new default level. If it is
-    not - fall back to ``_default_log_level``
+    not - fall back to `_default_log_level`
     """
     env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None)
     if env_level_str:
@@ -125,18 +125,19 @@ def get_verbosity() -> int:
     Return the current level for the 🤗 Transformers's root logger as an int.
 
     Returns:
-        :obj:`int`: The logging level.
+        `int`: The logging level.
 
-    .. note::
+    <Tip>
 
-        🤗 Transformers has following logging levels:
+    🤗 Transformers has the following logging levels:
 
-        - 50: ``transformers.logging.CRITICAL`` or ``transformers.logging.FATAL``
-        - 40: ``transformers.logging.ERROR``
-        - 30: ``transformers.logging.WARNING`` or ``transformers.logging.WARN``
-        - 20: ``transformers.logging.INFO``
-        - 10: ``transformers.logging.DEBUG``
-    """
+    - 50: `transformers.logging.CRITICAL` or `transformers.logging.FATAL`
+    - 40: `transformers.logging.ERROR`
+    - 30: `transformers.logging.WARNING` or `transformers.logging.WARN`
+    - 20: `transformers.logging.INFO`
+    - 10: `transformers.logging.DEBUG`
+
+    </Tip>"""
 
     _configure_library_root_logger()
     return _get_library_root_logger().getEffectiveLevel()
@@ -147,14 +148,14 @@ def set_verbosity(verbosity: int) -> None:
     Set the verbosity level for the 🤗 Transformers's root logger.
 
     Args:
-        verbosity (:obj:`int`):
+        verbosity (`int`):
             Logging level, e.g., one of:
 
-            - ``transformers.logging.CRITICAL`` or ``transformers.logging.FATAL``
-            - ``transformers.logging.ERROR``
-            - ``transformers.logging.WARNING`` or ``transformers.logging.WARN``
-            - ``transformers.logging.INFO``
-            - ``transformers.logging.DEBUG``
+            - `transformers.logging.CRITICAL` or `transformers.logging.FATAL`
+            - `transformers.logging.ERROR`
+            - `transformers.logging.WARNING` or `transformers.logging.WARN`
+            - `transformers.logging.INFO`
+            - `transformers.logging.DEBUG`
     """
 
     _configure_library_root_logger()
@@ -162,22 +163,22 @@ def set_verbosity(verbosity: int) -> None:
 
 
 def set_verbosity_info():
-    """Set the verbosity to the :obj:`INFO` level."""
+    """Set the verbosity to the `INFO` level."""
     return set_verbosity(INFO)
 
 
 def set_verbosity_warning():
-    """Set the verbosity to the :obj:`WARNING` level."""
+    """Set the verbosity to the `WARNING` level."""
     return set_verbosity(WARNING)
 
 
 def set_verbosity_debug():
-    """Set the verbosity to the :obj:`DEBUG` level."""
+    """Set the verbosity to the `DEBUG` level."""
     return set_verbosity(DEBUG)
 
 
 def set_verbosity_error():
-    """Set the verbosity to the :obj:`ERROR` level."""
+    """Set the verbosity to the `ERROR` level."""
     return set_verbosity(ERROR)
 
 
diff --git a/src/transformers/utils/notebook.py b/src/transformers/utils/notebook.py
index 5e7fc5b225..498e1ffe42 100644
--- a/src/transformers/utils/notebook.py
+++ b/src/transformers/utils/notebook.py
@@ -65,36 +65,37 @@ class NotebookProgressBar:
 
     Class attributes (overridden by derived classes)
 
-        - **warmup** (:obj:`int`) -- The number of iterations to do at the beginning while ignoring
-          :obj:`update_every`.
-        - **update_every** (:obj:`float`) -- Since calling the time takes some time, we only do it every presumed
-          :obj:`update_every` seconds. The progress bar uses the average time passed up until now to guess the next
+        - **warmup** (`int`) -- The number of iterations to do at the beginning while ignoring
+          `update_every`.
+        - **update_every** (`float`) -- Since calling the time takes some time, we only do it every presumed
+          `update_every` seconds. The progress bar uses the average time passed up until now to guess the next
           value for which it will call the update.
 
     Args:
-        total (:obj:`int`):
+        total (`int`):
             The total number of iterations to reach.
-        prefix (:obj:`str`, `optional`):
+        prefix (`str`, *optional*):
             A prefix to add before the progress bar.
-        leave (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        leave (`bool`, *optional*, defaults to `True`):
             Whether or not to leave the progress bar once it's completed. You can always call the
-            :meth:`~transformers.utils.notebook.NotebookProgressBar.close` method to make the bar disappear.
-        parent (:class:`~transformers.notebook.NotebookTrainingTracker`, `optional`):
-            A parent object (like :class:`~transformers.utils.notebook.NotebookTrainingTracker`) that spawns progress
-            bars and handle their display. If set, the object passed must have a :obj:`display()` method.
-        width (:obj:`int`, `optional`, defaults to 300):
+            [`~utils.notebook.NotebookProgressBar.close`] method to make the bar disappear.
+        parent ([`~utils.notebook.NotebookTrainingTracker`], *optional*):
+            A parent object (like [`~utils.notebook.NotebookTrainingTracker`]) that spawns progress
+            bars and handles their display. If set, the object passed must have a `display()` method.
+        width (`int`, *optional*, defaults to 300):
             The width (in pixels) that the bar will take.
 
-    Example::
+    Example:
 
-        import time
+    ```python
+    import time
 
-        pbar = NotebookProgressBar(100)
-        for val in range(100):
-            pbar.update(val)
-            time.sleep(0.07)
-        pbar.update(100)
-    """
+    pbar = NotebookProgressBar(100)
+    for val in range(100):
+        pbar.update(val)
+        time.sleep(0.07)
+    pbar.update(100)
+    ```"""
 
     warmup = 5
     update_every = 0.2
@@ -118,17 +119,17 @@ class NotebookProgressBar:
 
     def update(self, value: int, force_update: bool = False, comment: str = None):
         """
-        The main method to update the progress bar to :obj:`value`.
+        The main method to update the progress bar to `value`.
 
         Args:
 
-            value (:obj:`int`):
-                The value to use. Must be between 0 and :obj:`total`.
-            force_update (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            value (`int`):
+                The value to use. Must be between 0 and `total`.
+            force_update (`bool`, *optional*, defaults to `False`):
                 Whether or not to force and update of the internal state and display (by default, the bar will wait for
-                :obj:`value` to reach the value it predicted corresponds to a time of more than the :obj:`update_every`
+                `value` to reach the value it predicted corresponds to a time of more than the `update_every`
                 attribute since the last update to avoid adding boilerplate).
-            comment (:obj:`str`, `optional`):
+            comment (`str`, *optional*):
                 A comment to add on the left of the progress bar.
         """
         self.value = value
@@ -203,10 +204,10 @@ class NotebookTrainingTracker(NotebookProgressBar):
 
     Args:
 
-        num_steps (:obj:`int`): The number of steps during training.
-        column_names (:obj:`List[str]`, `optional`):
+        num_steps (`int`): The number of steps during training.
+        column_names (`List[str]`, *optional*):
             The list of column names for the metrics table (will be inferred from the first call to
-            :meth:`~transformers.utils.notebook.NotebookTrainingTracker.write_line` if not set).
+            [`~utils.notebook.NotebookTrainingTracker.write_line`] if not set).
     """
 
     def __init__(self, num_steps, column_names=None):
@@ -230,7 +231,7 @@ class NotebookTrainingTracker(NotebookProgressBar):
         Write the values in the inner table.
 
         Args:
-            values (:obj:`Dict[str, float]`): The values to display.
+            values (`Dict[str, float]`): The values to display.
         """
         if self.inner_table is None:
             self.inner_table = [list(values.keys()), list(values.values())]
@@ -250,9 +251,9 @@ class NotebookTrainingTracker(NotebookProgressBar):
         easily updated).
 
         Args:
-            total (:obj:`int`): The number of iterations for the child progress bar.
-            prefix (:obj:`str`, `optional`): A prefix to write on the left of the progress bar.
-            width (:obj:`int`, `optional`, defaults to 300): The width (in pixels) of the progress bar.
+            total (`int`): The number of iterations for the child progress bar.
+            prefix (`str`, *optional*): A prefix to write on the left of the progress bar.
+            width (`int`, *optional*, defaults to 300): The width (in pixels) of the progress bar.
         """
         self.child_bar = NotebookProgressBar(total, prefix=prefix, parent=self, width=width)
         return self.child_bar
@@ -267,7 +268,7 @@ class NotebookTrainingTracker(NotebookProgressBar):
 
 class NotebookProgressCallback(TrainerCallback):
     """
-    A :class:`~transformers.TrainerCallback` that displays the progress of training or evaluation, optimized for
+    A [`TrainerCallback`] that displays the progress of training or evaluation, optimized for
     Jupyter Notebooks or Google colab.
     """
 
diff --git a/src/transformers/utils/versions.py b/src/transformers/utils/versions.py
index cb2fbdb9d8..5f4aece8ad 100644
--- a/src/transformers/utils/versions.py
+++ b/src/transformers/utils/versions.py
@@ -55,18 +55,18 @@ def require_version(requirement: str, hint: Optional[str] = None) -> None:
     """
     Perform a runtime check of the dependency versions, using the exact same syntax used by pip.
 
-    The installed module version comes from the `site-packages` dir via `importlib_metadata`.
+    The installed module version comes from the *site-packages* dir via *importlib_metadata*.
 
     Args:
-        requirement (:obj:`str`): pip style definition, e.g.,  "tokenizers==0.9.4", "tqdm>=4.27", "numpy"
-        hint (:obj:`str`, `optional`): what suggestion to print in case of requirements not being met
+        requirement (`str`): pip style definition, e.g.,  "tokenizers==0.9.4", "tqdm>=4.27", "numpy"
+        hint (`str`, *optional*): what suggestion to print in case of requirements not being met
 
-    Example::
+    Example:
 
-       require_version("pandas>1.1.2")
-       require_version("numpy>1.18.5", "this is important to have for whatever reason")
-
-    """
+    ```python
+    require_version("pandas>1.1.2")
+    require_version("numpy>1.18.5", "this is important to have for whatever reason")
+    ```"""
 
     hint = f"\n{hint}" if hint is not None else ""
 
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py
index 6978a3ddf3..3fce8a55f1 100644
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py
@@ -28,108 +28,110 @@ logger = logging.get_logger(__name__)
 
 class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
     r"""
-    This is the configuration class to store the configuration of a :class:`~transformers.{{cookiecutter.camelcase_modelname}}Model`.
+    This is the configuration class to store the configuration of a [`~{{cookiecutter.camelcase_modelname}}Model`].
     It is used to instantiate an {{cookiecutter.modelname}} model according to the specified arguments, defining the model
     architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
-    the {{cookiecutter.modelname}} `{{cookiecutter.checkpoint_identifier}} <https://huggingface.co/{{cookiecutter.checkpoint_identifier}}>`__ architecture.
+    the {{cookiecutter.modelname}} [{{cookiecutter.checkpoint_identifier}}](https://huggingface.co/{{cookiecutter.checkpoint_identifier}}) architecture.
 
-    Configuration objects inherit from  :class:`~transformers.PretrainedConfig` and can be used
-    to control the model outputs. Read the documentation from  :class:`~transformers.PretrainedConfig`
+    Configuration objects inherit from  [`PretrainedConfig`] and can be used
+    to control the model outputs. Read the documentation from  [`PretrainedConfig`]
     for more information.
 
 
     Args:
         {% if cookiecutter.is_encoder_decoder_model == "False" -%}
-        vocab_size (:obj:`int`, `optional`, defaults to 30522):
+        vocab_size (`int`, *optional*, defaults to 30522):
             Vocabulary size of the {{cookiecutter.modelname}} model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.{{cookiecutter.camelcase_modelname}}Model` or
-            :class:`~transformers.TF{{cookiecutter.camelcase_modelname}}Model`.
-        hidden_size (:obj:`int`, `optional`, defaults to 768):
+            `inputs_ids` passed when calling [`~{{cookiecutter.camelcase_modelname}}Model`] or
+            [`~TF{{cookiecutter.camelcase_modelname}}Model`].
+        hidden_size (`int`, *optional*, defaults to 768):
             Dimension of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
+        num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
+        num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
+        intermediate_size (`int`, *optional*, defaults to 3072):
             Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler.
-            If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"selu"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+            If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (`int`, *optional*, defaults to 512):
             The maximum sequence length that this model might ever be used with.
             Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.{{cookiecutter.camelcase_modelname}}Model` or
-            :class:`~transformers.TF{{cookiecutter.camelcase_modelname}}Model`.
-        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`~{{cookiecutter.camelcase_modelname}}Model`] or
+            [`~TF{{cookiecutter.camelcase_modelname}}Model`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models). Only
-            relevant if ``config.is_decoder=True``.
+            relevant if `config.is_decoder=True`.
         {% else -%}
-        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+        vocab_size (`int`, *optional*, defaults to 50265):
             Vocabulary size of the {{cookiecutter.modelname}} model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.{{cookiecutter.camelcase_modelname}}Model` or
-            :class:`~transformers.TF{{cookiecutter.camelcase_modelname}}Model`.
-        d_model (:obj:`int`, `optional`, defaults to 1024):
+            `inputs_ids` passed when calling [`~{{cookiecutter.camelcase_modelname}}Model`] or
+            [`~TF{{cookiecutter.camelcase_modelname}}Model`].
+        d_model (`int`, *optional*, defaults to 1024):
             Dimension of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 12):
+        encoder_layers (`int`, *optional*, defaults to 12):
             Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+        decoder_layers (`int`, *optional*, defaults to 12):
             Number of decoder layers.
-        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
             Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimension of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
             Dimension of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string,
-            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
-        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            `"gelu"`, `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
-        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        activation_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+        classifier_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for classifier.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        init_std (:obj:`float`, `optional`, defaults to 0.02):
+        init_std (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
-            https://arxiv.org/abs/1909.11556>`__ for more details.
-        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
+            for more details.
+        use_cache (`bool`, *optional*, defaults to `True`):
             Whether or not the model should return the last key/values attentions (not used by all models).
         {% endif -%}
 
-    Example::
+    Example:
 
-        >>> from transformers import {{cookiecutter.camelcase_modelname}}Model, {{cookiecutter.camelcase_modelname}}Config
+    ```python
+    >>> from transformers import {{cookiecutter.camelcase_modelname}}Model, {{cookiecutter.camelcase_modelname}}Config
 
-        >>> # Initializing a {{cookiecutter.modelname}} {{cookiecutter.checkpoint_identifier}} style configuration
-        >>> configuration = {{cookiecutter.camelcase_modelname}}Config()
+    >>> # Initializing a {{cookiecutter.modelname}} {{cookiecutter.checkpoint_identifier}} style configuration
+    >>> configuration = {{cookiecutter.camelcase_modelname}}Config()
 
-        >>> # Initializing a model from the {{cookiecutter.checkpoint_identifier}} style configuration
-        >>> model = {{cookiecutter.camelcase_modelname}}Model(configuration)
+    >>> # Initializing a model from the {{cookiecutter.checkpoint_identifier}} style configuration
+    >>> model = {{cookiecutter.camelcase_modelname}}Model(configuration)
 
-        >>> # Accessing the model configuration
-        >>> configuration = model.config
-    """
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+"""
     model_type = "{{cookiecutter.lowercase_modelname}}"
     {% if cookiecutter.is_encoder_decoder_model == "False" -%}
     {% else -%}
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
index f20ec4021c..4005a98328 100644
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
@@ -42,12 +42,12 @@ PRETRAINED_INIT_CONFIGURATION = {
 
 class {{cookiecutter.camelcase_modelname}}TokenizerFast(BertTokenizerFast):
     r"""
-    Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
+    [`~{{cookiecutter.camelcase_modelname}}TokenizerFast`] is identical to [`BertTokenizerFast`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
 
@@ -86,12 +86,12 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class {{cookiecutter.camelcase_modelname}}TokenizerFast(BartTokenizerFast):
     r"""
-    Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
 
-    :class:`~transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast` is identical to :class:`~transformers.BartTokenizerFast` and runs
+    [`~{{cookiecutter.camelcase_modelname}}TokenizerFast`] is identical to [`BartTokenizerFast`] and runs
     end-to-end tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BartTokenizerFast` for usage examples and documentation concerning
+    Refer to superclass [`BartTokenizerFast`] for usage examples and documentation concerning
     parameters.
     """
 
@@ -129,10 +129,10 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
     """
 
@@ -182,13 +182,13 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast)
         {{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`:  List of zeros.
+            `List[int]`:  List of zeros.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
diff --git a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
index ec154a9b1c..7b668b2d19 100644
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
@@ -43,10 +43,10 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(BertTokenizer):
     r"""
     Construct a {{cookiecutter.modelname}} tokenizer.
 
-    :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer` is identical to :class:`~transformers.BertTokenizer` and runs end-to-end
+    [`~{{cookiecutter.camelcase_modelname}}Tokenizer`] is identical to [`BertTokenizer`] and runs end-to-end
     tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BertTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BertTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
@@ -85,10 +85,10 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(BartTokenizer):
     """
     Construct a {{cookiecutter.modelname}} tokenizer.
 
-    :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer` is identical to :class:`~transformers.BartTokenizer` and runs end-to-end
+    [`~{{cookiecutter.camelcase_modelname}}Tokenizer`] is identical to [`BartTokenizer`] and runs end-to-end
     tokenization: punctuation splitting and wordpiece.
 
-    Refer to superclass :class:`~transformers.BartTokenizer` for usage examples and documentation concerning
+    Refer to superclass [`BartTokenizer`] for usage examples and documentation concerning
     parameters.
     """
 
@@ -125,7 +125,7 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
     Construct a {{cookiecutter.modelname}} tokenizer. Based on byte-level Byte-Pair-Encoding.
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
     """
 
@@ -173,11 +173,11 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
         Save the vocabulary and special tokens file to a directory.
 
         Args:
-            save_directory (:obj:`str`):
+            save_directory (`str`):
                 The directory in which to save the vocabulary.
 
         Returns:
-            :obj:`Tuple(str)`: Paths to the files saved.
+            `Tuple(str)`: Paths to the files saved.
         """
 
     def build_inputs_with_special_tokens(
@@ -188,17 +188,17 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
         by concatenating and adding special tokens.
         A {{cookiecutter.modelname}} sequence has the following format:
 
-        - single sequence: ``<s> X </s>``
-        - pair of sequences: ``<s> A </s></s> B </s>``
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs to which the special tokens will be added.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
         """
         if token_ids_1 is None:
             return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
@@ -211,18 +211,18 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
     ) -> List[int]:
         """
         Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` method.
+        special tokens using the tokenizer `prepare_for_model` method.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
-            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                 Whether or not the token list is already formatted with special tokens for the model.
 
         Returns:
-            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
         """
         if already_has_special_tokens:
             return super().get_special_tokens_mask(
@@ -241,13 +241,13 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
         {{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`:  List of zeros.
+            `List[int]`:  List of zeros.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
@@ -264,10 +264,10 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
 
 class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast):
     """
-    Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's *tokenizers* library).
 
     Args:
-        vocab_file (:obj:`str`):
+        vocab_file (`str`):
             Path to the vocabulary file.
     """
 
@@ -317,13 +317,13 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast)
         {{cookiecutter.modelname}} does not make use of token type ids, therefore a list of zeros is returned.
 
         Args:
-            token_ids_0 (:obj:`List[int]`):
+            token_ids_0 (`List[int]`):
                 List of IDs.
-            token_ids_1 (:obj:`List[int]`, `optional`):
+            token_ids_1 (`List[int]`, *optional*):
                 Optional second list of IDs for sequence pairs.
 
         Returns:
-            :obj:`List[int]`:  List of zeros.
+            `List[int]`:  List of zeros.
         """
         sep = [self.sep_token_id]
         cls = [self.cls_token_id]
diff --git a/utils/check_repo.py b/utils/check_repo.py
index 988b8f2145..0a40a0376e 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -593,6 +593,52 @@ def check_all_objects_are_documented():
             "The following objects are in the public init so should be documented:\n - "
             + "\n - ".join(undocumented_objs)
         )
+    check_docstrings_are_in_md()
+
+
+# Re pattern to catch :obj:`xx`, :class:`xx`, :func:`xx` or :meth:`xx`.
+_re_rst_special_words = re.compile(r":(?:obj|func|class|meth):`([^`]+)`")
+# Re pattern to catch things between double backquotes.
+_re_double_backquotes = re.compile(r"(^|[^`])``([^`]+)``([^`]|$)")
+# Re pattern to catch example introduction.
+_re_rst_example = re.compile(r"^\s*Example.*::\s*$", flags=re.MULTILINE)
+
+
+def is_rst_docstring(docstring):
+    """
+    Returns `True` if `docstring` is written in rst.
+
+    It is considered rst as soon as it contains a role (obj, func, class or meth), text between double backquotes or
+    an example introduction ending with a double colon.
+    """
+    rst_patterns = (_re_rst_special_words, _re_double_backquotes, _re_rst_example)
+    return any(pattern.search(docstring) is not None for pattern in rst_patterns)
+
+
+def check_docstrings_are_in_md():
+    """
+    Check all docstrings are in md.
+
+    Raises:
+        `ValueError`: If at least one file under `PATH_TO_TRANSFORMERS` contains a docstring written in rst.
+    """
+    files_with_rst = []
+    for file in Path(PATH_TO_TRANSFORMERS).glob("**/*.py"):
+        # Read as UTF-8 explicitly so the result does not depend on the platform default encoding.
+        with open(file, "r", encoding="utf-8") as f:
+            code = f.read()
+        # Splitting on triple quotes puts the content of the triple-quoted strings at the odd indices.
+        docstrings = code.split('"""')[1::2]
+        if any(is_rst_docstring(docstring) for docstring in docstrings):
+            files_with_rst.append(file)
+
+    if len(files_with_rst) > 0:
+        raise ValueError(
+            "The following files have docstrings written in rst:\n"
+            + "\n".join([f"- {f}" for f in files_with_rst])
+            + "\nTo fix this run `doc_builder convert path_to_py_file` after installing `doc_builder`\n"
+            "(`pip install git+https://github.com/huggingface/doc-builder`)"
+        )
 
 
 def check_repo_quality():