From 133c5e40c4c34b54180f1f0f48791bece45f4418 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Mon, 27 Dec 2021 14:31:40 -0800
Subject: [PATCH] [doc] consistent True/False/None default format (#14951)

* [doc] consistent True/False/None default format

* Update src/transformers/models/xlnet/modeling_xlnet.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 docs/source/model_doc/segformer.mdx           |  4 +-
 src/transformers/generation_tf_utils.py       |  8 ++--
 src/transformers/generation_utils.py          | 48 +++++++++----------
 src/transformers/integrations.py              |  2 +-
 src/transformers/modeling_utils.py            |  4 +-
 .../tokenization_bert_japanese.py             |  2 +-
 .../models/bertweet/tokenization_bertweet.py  |  2 +-
 src/transformers/models/detr/modeling_detr.py |  4 +-
 .../modeling_encoder_decoder.py               |  2 +-
 .../modeling_flax_encoder_decoder.py          |  2 +-
 .../modeling_tf_encoder_decoder.py            |  2 +-
 .../layoutlmv2/tokenization_layoutlmv2.py     |  4 +-
 .../models/luke/tokenization_luke.py          |  4 +-
 .../models/lxmert/modeling_tf_lxmert.py       |  4 +-
 .../models/mluke/tokenization_mluke.py        |  4 +-
 .../models/perceiver/modeling_perceiver.py    |  4 +-
 src/transformers/models/rag/modeling_rag.py   |  6 +--
 .../models/rag/modeling_tf_rag.py             | 12 ++---
 .../modeling_speech_encoder_decoder.py        |  2 +-
 .../modeling_flax_vision_encoder_decoder.py   |  2 +-
 .../modeling_vision_encoder_decoder.py        |  2 +-
 .../modeling_flax_vision_text_dual_encoder.py |  2 +-
 .../modeling_vision_text_dual_encoder.py      |  2 +-
 .../models/xlnet/modeling_xlnet.py            |  2 +-
 src/transformers/optimization.py              |  2 +-
 src/transformers/optimization_tf.py           |  2 +-
 .../pipelines/audio_classification.py         |  2 +-
 src/transformers/tokenization_utils_base.py   |  2 +-
 src/transformers/trainer.py                   |  2 +-
 src/transformers/training_args.py             |  4 +-
 30 files changed, 72 insertions(+), 72 deletions(-)

diff --git a/docs/source/model_doc/segformer.mdx b/docs/source/model_doc/segformer.mdx
index bc053ee52f..04e1d3bc02 100644
--- a/docs/source/model_doc/segformer.mdx
+++ b/docs/source/model_doc/segformer.mdx
@@ -57,13 +57,13 @@ Tips:
   important preprocessing step is that images and segmentation maps are randomly cropped and padded to the same size,
   such as 512x512 or 640x640, after which they are normalized.
 - One additional thing to keep in mind is that one can initialize [`SegformerFeatureExtractor`] with
-  `reduce_labels` set to *True* or *False*. In some datasets (like ADE20k), the 0 index is used in the annotated
+  `reduce_labels` set to `True` or `False`. In some datasets (like ADE20k), the 0 index is used in the annotated
   segmentation maps for background. However, ADE20k doesn't include the "background" class in its 150 labels.
   Therefore, `reduce_labels` is used to reduce all labels by 1, and to make sure no loss is computed for the
   background class (i.e. it replaces 0 in the annotated maps by 255, which is the *ignore_index* of the loss function
   used by [`SegformerForSemanticSegmentation`]). However, other datasets use the 0 index as
   background class and include this class as part of all labels. In that case, `reduce_labels` should be set to
-  *False*, as loss should also be computed for the background class.
+  `False`, as loss should also be computed for the background class.
 - As most models, SegFormer comes in different sizes, the details of which can be found in the table below.
 
 | **Model variant** | **Depths**    | **Hidden sizes**    | **Decoder hidden size** | **Params (M)** | **ImageNet-1k Top 1** |
diff --git a/src/transformers/generation_tf_utils.py b/src/transformers/generation_tf_utils.py
index 5ac56dae1d..656c289c34 100644
--- a/src/transformers/generation_tf_utils.py
+++ b/src/transformers/generation_tf_utils.py
@@ -446,15 +446,15 @@ class TFGenerationMixin:
             use_cache: (`bool`, *optional*, defaults to `True`):
                 Whether or not the model should use the past last key/values attentions (if applicable to the model) to
                 speed up decoding.
-            output_attentions (`bool`, *optional*, defaults to *False*):
+            output_attentions (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to *False*):
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (`bool`, *optional*, defaults to *False*):
+            output_scores (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to *False*):
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                 Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
             forced_bos_token_id (`int`, *optional*):
                 The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful
diff --git a/src/transformers/generation_utils.py b/src/transformers/generation_utils.py
index 5ef9cf50ac..6d809e6bb8 100644
--- a/src/transformers/generation_utils.py
+++ b/src/transformers/generation_utils.py
@@ -878,15 +878,15 @@ class GenerationMixin:
                  Custom stopping criteria that complement the default stopping criteria built from arguments and a
                  model's config. If a stopping criteria is passed that is already created with the arguments or a
                  model's config an error is thrown. This feature is intended for advanced users.
-            output_attentions (`bool`, *optional*, defaults to *False*):
+            output_attentions (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to *False*):
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (`bool`, *optional*, defaults to *False*):
+            output_scores (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to *False*):
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                 Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
             forced_bos_token_id (`int`, *optional*):
                 The id of the token to force as the first generated token after the `decoder_start_token_id`. Useful
@@ -1302,15 +1302,15 @@ class GenerationMixin:
                 The id of the *padding* token.
             eos_token_id (`int`, *optional*):
                 The id of the *end-of-sequence* token.
-            output_attentions (`bool`, *optional*, defaults to *False*):
+            output_attentions (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to *False*):
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (`bool`, *optional*, defaults to *False*):
+            output_scores (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to *False*):
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                 Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
             synced_gpus (`bool`, *optional*, defaults to `False`):
                 Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
@@ -1529,15 +1529,15 @@ class GenerationMixin:
                 The id of the *padding* token.
             eos_token_id (`int`, *optional*):
                 The id of the *end-of-sequence* token.
-            output_attentions (`bool`, *optional*, defaults to *False*):
+            output_attentions (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to *False*):
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (`bool`, *optional*, defaults to *False*):
+            output_scores (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to *False*):
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                 Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
             synced_gpus (`bool`, *optional*, defaults to `False`):
                 Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
@@ -1767,15 +1767,15 @@ class GenerationMixin:
                 The id of the *padding* token.
             eos_token_id (`int`, *optional*):
                 The id of the *end-of-sequence* token.
-            output_attentions (`bool`, *optional*, defaults to *False*):
+            output_attentions (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to *False*):
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (`bool`, *optional*, defaults to *False*):
+            output_scores (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to *False*):
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                 Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
             synced_gpus (`bool`, *optional*, defaults to `False`):
                 Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
@@ -2061,15 +2061,15 @@ class GenerationMixin:
                 The id of the *padding* token.
             eos_token_id (`int`, *optional*):
                 The id of the *end-of-sequence* token.
-            output_attentions (`bool`, *optional*, defaults to *False*):
+            output_attentions (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to *False*):
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (`bool`, *optional*, defaults to *False*):
+            output_scores (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to *False*):
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                 Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
             synced_gpus (`bool`, *optional*, defaults to `False`):
                 Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
@@ -2356,15 +2356,15 @@ class GenerationMixin:
                 The id of the *padding* token.
             eos_token_id (`int`, *optional*):
                 The id of the *end-of-sequence* token.
-            output_attentions (`bool`, *optional*, defaults to *False*):
+            output_attentions (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to *False*):
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (`bool`, *optional*, defaults to *False*):
+            output_scores (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to *False*):
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                 Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
             synced_gpus (`bool`, *optional*, defaults to `False`):
                 Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
diff --git a/src/transformers/integrations.py b/src/transformers/integrations.py
index b4fba50bad..a18d06fe41 100644
--- a/src/transformers/integrations.py
+++ b/src/transformers/integrations.py
@@ -707,7 +707,7 @@ class MLflowCallback(TrainerCallback):
             HF_MLFLOW_LOG_ARTIFACTS (`str`, *optional*):
                 Whether to use MLflow .log_artifact() facility to log artifacts.
 
-                This only makes sense if logging to a remote server, e.g. s3 or GCS. If set to *True* or *1*, will copy
+                This only makes sense if logging to a remote server, e.g. s3 or GCS. If set to `True` or `1`, will copy
                 whatever is in [`TrainingArguments`]'s `output_dir` to the local or remote artifact storage. Using it
                 without a remote storage will just copy the files to your artifact location.
         """
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 34f9f857ec..5be78132a4 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1840,7 +1840,7 @@ class PoolerEndLogits(nn.Module):
 
         <Tip>
 
-        One of `start_states` or `start_positions` should be not obj:*None*. If both are set, `start_positions`
+        One of `start_states` or `start_positions` should not be `None`. If both are set, `start_positions`
         overrides `start_states`.
 
         </Tip>
@@ -1906,7 +1906,7 @@ class PoolerAnswerClass(nn.Module):
 
         <Tip>
 
-        One of `start_states` or `start_positions` should be not obj:*None*. If both are set, `start_positions`
+        One of `start_states` or `start_positions` should not be `None`. If both are set, `start_positions`
         overrides `start_states`.
 
         </Tip>
diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
index 0d50dadd00..588612029a 100644
--- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
+++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py
@@ -219,7 +219,7 @@ class MecabTokenizer:
                 Whether to apply unicode normalization to text before tokenization.
             **mecab_dic**: (*optional*) string (default "ipadic")
                 Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary,
-                set this option to *None* and modify *mecab_option*.
+                set this option to `None` and modify *mecab_option*.
             **mecab_option**: (*optional*) string
                 String passed to MeCab constructor.
         """
diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py
index f445b68ed7..71f4038ffe 100644
--- a/src/transformers/models/bertweet/tokenization_bertweet.py
+++ b/src/transformers/models/bertweet/tokenization_bertweet.py
@@ -632,7 +632,7 @@ def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8")
             List of entity names which should not be replaced. This supports both numeric entities (`&#nnnn;` and
             `&#hhhh;`) and named entities (such as `&nbsp;` or `&gt;`).
         remove_illegal (bool):
-            If *True*, entities that can't be converted are removed. Otherwise, entities that can't be converted are
+            If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are
             kept "as is".
 
     Returns: A unicode string with the entities removed.
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index b05fd1d56c..e57ec6fb58 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -150,7 +150,7 @@ class DetrObjectDetectionOutput(ModelOutput):
             possible padding). You can use [`~DetrFeatureExtractor.post_process`] to retrieve the unnormalized bounding
             boxes.
         auxiliary_outputs (`list[Dict]`, *optional*):
-            Optional, only returned when auxilary losses are activated (i.e. `config.auxiliary_loss` is set to *True*)
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
             and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
             `pred_boxes`) for each decoder layer.
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
@@ -217,7 +217,7 @@ class DetrSegmentationOutput(ModelOutput):
             [`~DetrFeatureExtractor.post_process_panoptic`] to evaluate instance and panoptic segmentation masks
             respectively.
         auxiliary_outputs (`list[Dict]`, *optional*):
-            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to *True*)
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
             and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
             `pred_boxes`) for each decoder layer.
         last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
index 44c2568719..8105b8df43 100644
--- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
@@ -306,7 +306,7 @@ class EncoderDecoderModel(PreTrainedModel):
                       `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                       PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
-            decoder_pretrained_model_name_or_path (:obj: *str*, *optional*, defaults to *None*):
+            decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                 Information necessary to initiate the decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
diff --git a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
index 1a1994392f..9ea8f31fc2 100644
--- a/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_flax_encoder_decoder.py
@@ -755,7 +755,7 @@ class FlaxEncoderDecoderModel(FlaxPreTrainedModel):
                     - A path to a *directory* containing model weights saved using
                       [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
 
-            decoder_pretrained_model_name_or_path (:obj: *Union[str, os.PathLike]*, *optional*, defaults to *None*):
+            decoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*, defaults to `None`):
                 Information necessary to initiate the decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
index e6c88cc02f..357d19b4ed 100644
--- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
@@ -319,7 +319,7 @@ class TFEncoderDecoderModel(TFPreTrainedModel):
                     - A path or url to a *pytorch index checkpoint file* (e.g, `./pt_model/`). In this case,
                       `encoder_from_pt` should be set to `True`.
 
-            decoder_pretrained_model_name_or_path (:obj: *str*, *optional*, defaults to *None*):
+            decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                 Information necessary to initiate the decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
diff --git a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
index 3f9816dd62..e92190bf1f 100644
--- a/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/tokenization_layoutlmv2.py
@@ -888,8 +888,8 @@ class LayoutLMv2Tokenizer(PreTrainedTokenizer):
         """
         Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens,
         truncates sequences if overflowing while taking into account the special tokens and manages a moving window
-        (with user defined stride) for overflowing tokens. Please Note, for *text_pair* different than *None* and
-        *truncation_strategy = longest_first* or *True*, it is not possible to return overflowing tokens. Such a
+        (with user defined stride) for overflowing tokens. Please Note, for *text_pair* different than `None` and
+        *truncation_strategy = longest_first* or `True`, it is not possible to return overflowing tokens. Such a
         combination of arguments will raise an error.
 
         Word-level `boxes` are turned into token-level `bbox`. If provided, word-level `word_labels` are turned into
diff --git a/src/transformers/models/luke/tokenization_luke.py b/src/transformers/models/luke/tokenization_luke.py
index 92ff3da4bd..a4033cda18 100644
--- a/src/transformers/models/luke/tokenization_luke.py
+++ b/src/transformers/models/luke/tokenization_luke.py
@@ -879,8 +879,8 @@ class LukeTokenizer(RobertaTokenizer):
         Prepares a sequence of input id, entity id and entity span, or a pair of sequences of inputs ids, entity ids,
         entity spans so that it can be used by the model. It adds special tokens, truncates sequences if overflowing
         while taking into account the special tokens and manages a moving window (with user defined stride) for
-        overflowing tokens. Please Note, for *pair_ids* different than *None* and *truncation_strategy = longest_first*
-        or *True*, it is not possible to return overflowing tokens. Such a combination of arguments will raise an
+        overflowing tokens. Please Note, for *pair_ids* different than `None` and *truncation_strategy = longest_first*
+        or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an
         error.
 
         Args:
diff --git a/src/transformers/models/lxmert/modeling_tf_lxmert.py b/src/transformers/models/lxmert/modeling_tf_lxmert.py
index b9f7c12764..09115d2c72 100644
--- a/src/transformers/models/lxmert/modeling_tf_lxmert.py
+++ b/src/transformers/models/lxmert/modeling_tf_lxmert.py
@@ -1324,7 +1324,7 @@ class TFLxmertForPreTraining(TFLxmertPreTrainedModel):
             Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
             config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
             loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
-        obj_labels: (`Dict[Str: Tuple[tf.Tensor, tf.Tensor]]`, *optional*, defaults to :obj: *None*):
+        obj_labels: (`Dict[Str: Tuple[tf.Tensor, tf.Tensor]]`, *optional*, defaults to `None`):
             each key is named after each one of the visual losses and each element of the tuple is of the shape
             `(batch_size, num_features)` and `(batch_size, num_features, visual_feature_dim)` for each the label id and
             the label score respectively
@@ -1334,7 +1334,7 @@ class TFLxmertForPreTraining(TFLxmertPreTrainedModel):
 
             - 0 indicates that the sentence does not match the image,
             - 1 indicates that the sentence does match the image.
-        ans (`Torch.Tensor` of shape `(batch_size)`, *optional*, defaults to :obj: *None*):
+        ans (`Torch.Tensor` of shape `(batch_size)`, *optional*, defaults to `None`):
             a one hot representation hof the correct answer *optional*
 
         Returns:
diff --git a/src/transformers/models/mluke/tokenization_mluke.py b/src/transformers/models/mluke/tokenization_mluke.py
index 679ad289ec..9e034615d5 100644
--- a/src/transformers/models/mluke/tokenization_mluke.py
+++ b/src/transformers/models/mluke/tokenization_mluke.py
@@ -991,8 +991,8 @@ class MLukeTokenizer(PreTrainedTokenizer):
         Prepares a sequence of input id, entity id and entity span, or a pair of sequences of inputs ids, entity ids,
         entity spans so that it can be used by the model. It adds special tokens, truncates sequences if overflowing
         while taking into account the special tokens and manages a moving window (with user defined stride) for
-        overflowing tokens. Please Note, for *pair_ids* different than *None* and *truncation_strategy = longest_first*
-        or *True*, it is not possible to return overflowing tokens. Such a combination of arguments will raise an
+        overflowing tokens. Please Note, for *pair_ids* different than `None` and *truncation_strategy = longest_first*
+        or `True`, it is not possible to return overflowing tokens. Such a combination of arguments will raise an
         error.
 
         Args:
diff --git a/src/transformers/models/perceiver/modeling_perceiver.py b/src/transformers/models/perceiver/modeling_perceiver.py
index 12b9749aba..ce14e16711 100755
--- a/src/transformers/models/perceiver/modeling_perceiver.py
+++ b/src/transformers/models/perceiver/modeling_perceiver.py
@@ -2035,7 +2035,7 @@ class PerceiverBasicDecoder(PerceiverAbstractDecoder):
         config ([*PerceiverConfig*]):
             Model configuration.
         output_num_channels (`int`, *optional*):
-            The number of channels in the output. Will only be used in case *final_project* is set to *True*.
+            The number of channels in the output. Will only be used in case *final_project* is set to `True`.
         position_encoding_type (`str`, *optional*, defaults to "trainable"):
             The type of position encoding to use. Can be either "trainable", "fourier", or "none".
         output_index_dims (`int`, *optional*):
@@ -2583,7 +2583,7 @@ def generate_fourier_features(pos, num_bands, max_resolution=(224, 224), concat_
 
     Returns:
       `torch.FloatTensor` of shape `(batch_size, sequence_length, n_channels)`: The Fourier position embeddings. If
-      `concat_pos` is *True* and `sine_only` is *False*, output dimensions are ordered as: [dim_1, dim_2, ..., dim_d,
+      `concat_pos` is `True` and `sine_only` is `False`, output dimensions are ordered as: [dim_1, dim_2, ..., dim_d,
       sin(pi*f_1*dim_1), ..., sin(pi*f_K*dim_1), ..., sin(pi*f_1*dim_d), ..., sin(pi*f_K*dim_d), cos(pi*f_1*dim_1),
       ..., cos(pi*f_K*dim_1), ..., cos(pi*f_1*dim_d), ..., cos(pi*f_K*dim_d)], where dim_i is pos[:, i] and f_k is the
       kth frequency band.
diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py
index 084601e920..f6869bb406 100644
--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -258,7 +258,7 @@ class RagPreTrainedModel(PreTrainedModel):
         the model, you need to first set it back in training mode with `model.train()`.
 
         Params:
-            question_encoder_pretrained_model_name_or_path (:obj: *str*, *optional*, defaults to *None*):
+            question_encoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                 Information necessary to initiate the question encoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
@@ -271,7 +271,7 @@ class RagPreTrainedModel(PreTrainedModel):
                       `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                       PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
-            generator_pretrained_model_name_or_path (:obj: *str*, *optional*, defaults to *None*):
+            generator_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                 Information necessary to initiate the generator. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
@@ -444,7 +444,7 @@ RAG_FORWARD_INPUTS_DOCSTRING = r"""
 
             Used by the ([`RagModel`]) model during decoding.
         decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
-            Provide for generation tasks. *None* by default, construct as per instructions for the generator model
+            Provide for generation tasks. `None` by default, construct as per instructions for the generator model
             you're using with your RAG instance.
         decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
             Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
diff --git a/src/transformers/models/rag/modeling_tf_rag.py b/src/transformers/models/rag/modeling_tf_rag.py
index 948decc436..bb45f8e4c3 100644
--- a/src/transformers/models/rag/modeling_tf_rag.py
+++ b/src/transformers/models/rag/modeling_tf_rag.py
@@ -245,7 +245,7 @@ class TFRagPreTrainedModel(TFPreTrainedModel):
                     - A path or url to a *pytorch index checkpoint file* (e.g, `./pt_model/`). In this case,
                       `question_encoder_from_pt` should be set to `True`.
 
-            generator_pretrained_model_name_or_path (:obj: *str*, *optional*, defaults to *None*):
+            generator_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                 Information necessary to initiate the generator. Can be either:
 
                     - A string with the *shortcut name* of a pretrained model to load from cache or download, e.g.,
@@ -426,7 +426,7 @@ RAG_FORWARD_INPUTS_DOCSTRING = r"""
 
             Used by the ([`TFRagModel`]) model during decoding.
         decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
-            Provide for generation tasks. *None* by default, construct as per instructions for the generator model
+            Provide for generation tasks. `None` by default, construct as per instructions for the generator model
             you're using with your RAG instance.
         decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size,  target_sequence_length)`, *optional*):
             Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
@@ -1136,15 +1136,15 @@ class TFRagTokenForGeneration(TFRagPreTrainedModel, TFCausalLanguageModelingLoss
                 encoder-decoder model starts decoding with a different token than *bos*, the id of that token.
             n_docs (`int`, *optional*, defaults to `config.n_docs`)
                 Number of documents to retrieve and/or number of documents for which to generate an answer.
-            output_attentions (`bool`, *optional*, defaults to *False*):
+            output_attentions (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more details.
-            output_hidden_states (`bool`, *optional*, defaults to *False*):
+            output_hidden_states (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                 for more details.
-            output_scores (`bool`, *optional*, defaults to *False*):
+            output_scores (`bool`, *optional*, defaults to `False`):
                 Whether or not to return the prediction scores. See `scores` under returned tensors for more details.
-            return_dict_in_generate (`bool`, *optional*, defaults to *False*):
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
                 Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
             model_specific_kwargs:
                 Additional model specific kwargs will be forwarded to the `forward` function of the model.
diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
index dbd80963a8..5c930d8952 100644
--- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
@@ -300,7 +300,7 @@ class SpeechEncoderDecoderModel(PreTrainedModel):
                       `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                       PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
-            decoder_pretrained_model_name_or_path (:obj: *str*, *optional*, defaults to *None*):
+            decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                 Information necessary to initiate the decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
index e97a8331f8..0cc68c4feb 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
@@ -720,7 +720,7 @@ class FlaxVisionEncoderDecoderModel(FlaxPreTrainedModel):
                     - A path to a *directory* containing model weights saved using
                       [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
 
-            decoder_pretrained_model_name_or_path (:obj: *Union[str, os.PathLike]*, *optional*, defaults to *None*):
+            decoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*, defaults to `None`):
                 Information necessary to initiate the decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
index 23759c543d..fbc4492473 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py
@@ -272,7 +272,7 @@ class VisionEncoderDecoderModel(PreTrainedModel):
                       `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a
                       PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
-            decoder_pretrained_model_name_or_path (:obj: *str*, *optional*, defaults to *None*):
+            decoder_pretrained_model_name_or_path (`str`, *optional*, defaults to `None`):
                 Information necessary to initiate the text decoder. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
index 2b54e310b9..59b59dba99 100644
--- a/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
@@ -403,7 +403,7 @@ class FlaxVisionTextDualEncoderModel(FlaxPreTrainedModel):
     ) -> FlaxPreTrainedModel:
         """
         Params:
-            vision_model_name_or_path (:obj: *str*, *optional*, defaults to *None*):
+            vision_model_name_or_path (`str`, *optional*, defaults to `None`):
                 Information necessary to initiate the vision model. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
index 795af8be4c..26717a88de 100755
--- a/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py
@@ -404,7 +404,7 @@ class VisionTextDualEncoderModel(PreTrainedModel):
     ) -> PreTrainedModel:
         """
         Params:
-            vision_model_name_or_path (:obj: *str*, *optional*, defaults to *None*):
+            vision_model_name_or_path (`str`, *optional*, defaults to `None`):
                 Information necessary to initiate the vision model. Can be either:
 
                     - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
diff --git a/src/transformers/models/xlnet/modeling_xlnet.py b/src/transformers/models/xlnet/modeling_xlnet.py
index 828af81cd8..ebde4e5292 100755
--- a/src/transformers/models/xlnet/modeling_xlnet.py
+++ b/src/transformers/models/xlnet/modeling_xlnet.py
@@ -1383,7 +1383,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, num_predict)`, *optional*):
             Labels for masked language modeling. `num_predict` corresponds to `target_mapping.shape[1]`. If
-            `target_mapping` is :obj*None*, then `num_predict` corresponds to `sequence_length`.
+            `target_mapping` is `None`, then `num_predict` corresponds to `sequence_length`.
 
             The labels should correspond to the masked input words that should be predicted and depends on
             `target_mapping`. Note in order to perform standard auto-regressive language modeling a *<mask>* token has
diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index f21a240ad6..50e7d3cd00 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -285,7 +285,7 @@ class AdamW(Optimizer):
             Adam's epsilon for numerical stability.
         weight_decay (`float`, *optional*, defaults to 0):
             Decoupled weight decay to apply.
-        correct_bias (`bool`, *optional*, defaults to *True*):
+        correct_bias (`bool`, *optional*, defaults to `True`):
             Whether or not to correct bias in Adam (for instance, in Bert TF repository they use `False`).
     """
 
diff --git a/src/transformers/optimization_tf.py b/src/transformers/optimization_tf.py
index 02acb8aa5e..345b2eaf1f 100644
--- a/src/transformers/optimization_tf.py
+++ b/src/transformers/optimization_tf.py
@@ -168,7 +168,7 @@ class AdamWeightDecay(tf.keras.optimizers.Adam):
             The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates.
         epsilon (`float`, *optional*, defaults to 1e-7):
             The epsilon parameter in Adam, which is a small constant for numerical stability.
-        amsgrad (`bool`, *optional*, default to *False*):
+        amsgrad (`bool`, *optional*, defaults to `False`):
             Whether to apply AMSGrad variant of this algorithm or not, see [On the Convergence of Adam and
             Beyond](https://arxiv.org/abs/1904.09237).
         weight_decay_rate (`float`, *optional*, defaults to 0):
diff --git a/src/transformers/pipelines/audio_classification.py b/src/transformers/pipelines/audio_classification.py
index 6f86cb803c..3957d82beb 100644
--- a/src/transformers/pipelines/audio_classification.py
+++ b/src/transformers/pipelines/audio_classification.py
@@ -104,7 +104,7 @@ class AudioClassificationPipeline(Pipeline):
                 requires *ffmpeg* to be installed on the system. If *inputs* is `bytes` it is supposed to be the
                 content of an audio file and is interpreted by *ffmpeg* in the same way.
             top_k (`int`, *optional*, defaults to None):
-                The number of top labels that will be returned by the pipeline. If the provided number is *None* or
+                The number of top labels that will be returned by the pipeline. If the provided number is `None` or
                 higher than the number of labels available in the model configuration, it will default to the number of
                 labels.
 
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 8d9182a30c..0365f69cc7 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -2868,7 +2868,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
         adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
         manages a moving window (with user defined stride) for overflowing tokens. Please Note, for *pair_ids*
-        different than *None* and *truncation_strategy = longest_first* or *True*, it is not possible to return
+        different than `None` and *truncation_strategy = longest_first* or `True`, it is not possible to return
         overflowing tokens. Such a combination of arguments will raise an error.
 
         Args:
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 59118c75c0..ff0e9e6222 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -1034,7 +1034,7 @@ class Trainer:
         Args:
             resume_from_checkpoint (`str` or `bool`, *optional*):
                 If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a
-                `bool` and equals *True*, load the last checkpoint in *args.output_dir* as saved by a previous instance
+                `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance
                 of [`Trainer`]. If present, training will resume from the model/optimizer/scheduler states loaded here.
             trial (`optuna.Trial` or `Dict[str, Any]`, *optional*):
                 The trial run or the hyperparameter dictionary for hyperparameter search.
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index 6209c742c6..b134905fe2 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -105,7 +105,7 @@ class TrainingArguments:
                 - `"steps"`: Evaluation is done (and logged) every `eval_steps`.
                 - `"epoch"`: Evaluation is done at the end of each epoch.
 
-        prediction_loss_only (`bool`, *optional*, defaults to *False*):
+        prediction_loss_only (`bool`, *optional*, defaults to `False`):
             When performing evaluation and generating predictions, only returns the loss.
         per_device_train_batch_size (`int`, *optional*, defaults to 8):
             The batch size per GPU/TPU core/CPU for training.
@@ -175,7 +175,7 @@ class TrainingArguments:
         logging_steps (`int`, *optional*, defaults to 500):
             Number of update steps between two logs if `logging_strategy="steps"`.
         logging_nan_inf_filter (`bool`, *optional*, defaults to `True`):
-            Whether to filter `nan` and `inf` losses for logging. If set to obj:*True* the loss of every step that is
+            Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is
             `nan` or `inf` is filtered and the average loss of the current logging window is taken instead.
 
             <Tip>