Doc styler examples (#14953)

* Fix bad examples
* Add black formatting to style_doc
* Use first nonempty line
* Put it at the right place
* Don't add spaces to empty lines
* Better templates
* Deal with triple quotes in docstrings
* Result of style_doc
* Enable mdx treatment and fix code examples in MDXs
* Result of doc styler on doc source files
* Last fixes
* Break copy from
2021-12-27 19:07:46 -05:00
parent e13f72fbff
commit b5e2b183af
211 changed files with 2738 additions and 1711 deletions
--- a/docs/source/model_doc/bart.mdx
+++ b/docs/source/model_doc/bart.mdx
@@ -64,12 +64,15 @@ The `facebook/bart-base` and `facebook/bart-large` checkpoints can be used to fi

 ```python
 from transformers import BartForConditionalGeneration, BartTokenizer
+
 model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", forced_bos_token_id=0)
 tok = BartTokenizer.from_pretrained("facebook/bart-large")
 example_english_phrase = "UN Chief Says There Is No <mask> in Syria"
-batch = tok(example_english_phrase, return_tensors='pt')
-generated_ids = model.generate(batch['input_ids'])
-assert tok.batch_decode(generated_ids, skip_special_tokens=True) == ['UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria']
+batch = tok(example_english_phrase, return_tensors="pt")
+generated_ids = model.generate(batch["input_ids"])
+assert tok.batch_decode(generated_ids, skip_special_tokens=True) == [
+    "UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria"
+]
 ```

 ## BartConfig
--- a/docs/source/model_doc/bartpho.mdx
+++ b/docs/source/model_doc/bartpho.mdx
@@ -44,6 +44,7 @@ Example of use:

 >>> # With TensorFlow 2.0+:
 >>> from transformers import TFAutoModel
+
 >>> bartpho = TFAutoModel.from_pretrained("vinai/bartpho-syllable")
 >>> input_ids = tokenizer(line, return_tensors="tf")
 >>> features = bartpho(**input_ids)
@@ -58,9 +59,10 @@ Tips:

 ```python
 >>> from transformers import MBartForConditionalGeneration
+
 >>> bartpho = MBartForConditionalGeneration.from_pretrained("vinai/bartpho-syllable")
->>> TXT = 'Chúng tôi là <mask> nghiên cứu viên.'
->>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
+>>> TXT = "Chúng tôi là <mask> nghiên cứu viên."
+>>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
 >>> logits = bartpho(input_ids).logits
 >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
 >>> probs = logits[0, masked_index].softmax(dim=0)
--- a/docs/source/model_doc/bert_japanese.mdx
+++ b/docs/source/model_doc/bert_japanese.mdx
@@ -30,7 +30,7 @@ Example of using a model with MeCab and WordPiece tokenization:

 ```python
 >>> import torch
->>> from transformers import AutoModel, AutoTokenizer 
+>>> from transformers import AutoModel, AutoTokenizer

 >>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
 >>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
@@ -40,7 +40,7 @@ Example of using a model with MeCab and WordPiece tokenization:

 >>> inputs = tokenizer(line, return_tensors="pt")

->>> print(tokenizer.decode(inputs['input_ids'][0]))
+>>> print(tokenizer.decode(inputs["input_ids"][0]))
 [CLS] 吾輩 は 猫 で ある 。 [SEP]

 >>> outputs = bertjapanese(**inputs)
@@ -57,7 +57,7 @@ Example of using a model with Character tokenization:

 >>> inputs = tokenizer(line, return_tensors="pt")

->>> print(tokenizer.decode(inputs['input_ids'][0]))
+>>> print(tokenizer.decode(inputs["input_ids"][0]))
 [CLS] 吾 輩 は 猫 で あ る 。 [SEP]

 >>> outputs = bertjapanese(**inputs)
--- a/docs/source/model_doc/bertgeneration.mdx
+++ b/docs/source/model_doc/bertgeneration.mdx
@@ -39,14 +39,18 @@ Usage:
 >>> # use BERT's cls token as BOS token and sep token as EOS token
 >>> encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
 >>> # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
->>> decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
+>>> decoder = BertGenerationDecoder.from_pretrained(
+...     "bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
+... )
 >>> bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)

 >>> # create tokenizer...
 >>> tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

->>> input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
->>> labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
+>>> input_ids = tokenizer(
+...     "This is a long article to summarize", add_special_tokens=False, return_tensors="pt"
+... ).input_ids
+>>> labels = tokenizer("This is a short summary", return_tensors="pt").input_ids

 >>> # train...
 >>> loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
@@ -61,7 +65,9 @@ Usage:
 >>> sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
 >>> tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")

->>> input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids
+>>> input_ids = tokenizer(
+...     "This is the first sentence. This is the second sentence.", add_special_tokens=False, return_tensors="pt"
+... ).input_ids

 >>> outputs = sentence_fuser.generate(input_ids)

--- a/docs/source/model_doc/bertweet.mdx
+++ b/docs/source/model_doc/bertweet.mdx
@@ -28,14 +28,14 @@ Example of use:

 ```python
 >>> import torch
->>> from transformers import AutoModel, AutoTokenizer 
+>>> from transformers import AutoModel, AutoTokenizer

 >>> bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

->>> # For transformers v4.x+: 
+>>> # For transformers v4.x+:
 >>> tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

->>> # For transformers v3.x: 
+>>> # For transformers v3.x:
 >>> # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")

 >>> # INPUT TWEET IS ALREADY NORMALIZED!
--- a/docs/source/model_doc/blenderbot.mdx
+++ b/docs/source/model_doc/blenderbot.mdx
@@ -50,11 +50,12 @@ Here is an example of model usage:

 ```python
 >>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
->>> mname = 'facebook/blenderbot-400M-distill'
+
+>>> mname = "facebook/blenderbot-400M-distill"
 >>> model = BlenderbotForConditionalGeneration.from_pretrained(mname)
 >>> tokenizer = BlenderbotTokenizer.from_pretrained(mname)
 >>> UTTERANCE = "My friends are cool but they eat too many carbs."
->>> inputs = tokenizer([UTTERANCE], return_tensors='pt')
+>>> inputs = tokenizer([UTTERANCE], return_tensors="pt")
 >>> reply_ids = model.generate(**inputs)
 >>> print(tokenizer.batch_decode(reply_ids))
 ["<s> That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?</s>"]
--- a/docs/source/model_doc/byt5.mdx
+++ b/docs/source/model_doc/byt5.mdx
@@ -51,12 +51,14 @@ ByT5 works on raw UTF-8 bytes, so it can be used without a tokenizer:
 from transformers import T5ForConditionalGeneration
 import torch

-model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')
+model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")

 input_ids = torch.tensor([list("Life is like a box of chocolates.".encode("utf-8"))]) + 3  # add 3 for special tokens
-labels = torch.tensor([list("La vie est comme une boîte de chocolat.".encode("utf-8"))]) + 3  # add 3 for special tokens
+labels = (
+    torch.tensor([list("La vie est comme une boîte de chocolat.".encode("utf-8"))]) + 3
+)  # add 3 for special tokens

-loss = model(input_ids, labels=labels).loss # forward pass
+loss = model(input_ids, labels=labels).loss  # forward pass
 ```

 For batched inference and training it is however recommended to make use of the tokenizer:
@@ -64,13 +66,17 @@ For batched inference and training it is however recommended to make use of the
 ```python
 from transformers import T5ForConditionalGeneration, AutoTokenizer

-model = T5ForConditionalGeneration.from_pretrained('google/byt5-small')
-tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')
+model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

-model_inputs = tokenizer(["Life is like a box of chocolates.", "Today is Monday."], padding="longest", return_tensors="pt")
-labels = tokenizer(["La vie est comme une boîte de chocolat.", "Aujourd'hui c'est lundi."], padding="longest", return_tensors="pt").input_ids
+model_inputs = tokenizer(
+    ["Life is like a box of chocolates.", "Today is Monday."], padding="longest", return_tensors="pt"
+)
+labels = tokenizer(
+    ["La vie est comme une boîte de chocolat.", "Aujourd'hui c'est lundi."], padding="longest", return_tensors="pt"
+).input_ids

-loss = model(**model_inputs, labels=labels).loss # forward pass
+loss = model(**model_inputs, labels=labels).loss  # forward pass
 ```

 ## ByT5Tokenizer
--- a/docs/source/model_doc/canine.mdx
+++ b/docs/source/model_doc/canine.mdx
@@ -64,13 +64,13 @@ CANINE works on raw characters, so it can be used without a tokenizer:
 >>> from transformers import CanineModel
 >>> import torch

->>> model = CanineModel.from_pretrained('google/canine-c') # model pre-trained with autoregressive character loss
+>>> model = CanineModel.from_pretrained("google/canine-c")  # model pre-trained with autoregressive character loss

 >>> text = "hello world"
 >>> # use Python's built-in ord() function to turn each character into its unicode code point id
 >>> input_ids = torch.tensor([[ord(char) for char in text]])

->>> outputs = model(input_ids) # forward pass
+>>> outputs = model(input_ids)  # forward pass
 >>> pooled_output = outputs.pooler_output
 >>> sequence_output = outputs.last_hidden_state
 ```
@@ -81,13 +81,13 @@ sequences to the same length):
 ```python
 >>> from transformers import CanineTokenizer, CanineModel

->>> model = CanineModel.from_pretrained('google/canine-c')
->>> tokenizer = CanineTokenizer.from_pretrained('google/canine-c')
+>>> model = CanineModel.from_pretrained("google/canine-c")
+>>> tokenizer = CanineTokenizer.from_pretrained("google/canine-c")

 >>> inputs = ["Life is like a box of chocolates.", "You never know what you gonna get."]
 >>> encoding = tokenizer(inputs, padding="longest", truncation=True, return_tensors="pt")

->>> outputs = model(**encoding) # forward pass
+>>> outputs = model(**encoding)  # forward pass
 >>> pooled_output = outputs.pooler_output
 >>> sequence_output = outputs.last_hidden_state
 ```
--- a/docs/source/model_doc/clip.mdx
+++ b/docs/source/model_doc/clip.mdx
@@ -69,8 +69,8 @@ encode the text and prepare the images. The following example shows how to get t
 >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

 >>> outputs = model(**inputs)
->>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
->>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
 ```

 This model was contributed by [valhalla](https://huggingface.co/valhalla). The original code can be found [here](https://github.com/openai/CLIP).
--- a/docs/source/model_doc/gpt_neo.mdx
+++ b/docs/source/model_doc/gpt_neo.mdx
@@ -29,16 +29,24 @@ The `generate()` method can be used to generate text using GPT Neo model.

 ```python
 >>> from transformers import GPTNeoForCausalLM, GPT2Tokenizer
+
 >>> model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B")
 >>> tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")

->>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
-...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
-...          "researchers was the fact that the unicorns spoke perfect English."
+>>> prompt = (
+...     "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
+...     "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
+...     "researchers was the fact that the unicorns spoke perfect English."
+... )

 >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids

->>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+>>> gen_tokens = model.generate(
+...     input_ids,
+...     do_sample=True,
+...     temperature=0.9,
+...     max_length=100,
+... )
 >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
 ```

--- a/docs/source/model_doc/gptj.mdx
+++ b/docs/source/model_doc/gptj.mdx
@@ -33,7 +33,9 @@ Tips:
 >>> from transformers import GPTJForCausalLM
 >>> import torch

->>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True)
+>>> model = GPTJForCausalLM.from_pretrained(
+...     "EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True
+... )
 ```

 - The model should fit on 16GB GPU for inference. For training/fine-tuning it would take much more GPU RAM. Adam
@@ -56,16 +58,24 @@ model.

 ```python
 >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+
 >>> model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
 >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

->>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
-...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
-...          "researchers was the fact that the unicorns spoke perfect English."
+>>> prompt = (
+...     "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
+...     "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
+...     "researchers was the fact that the unicorns spoke perfect English."
+... )

 >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids

->>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+>>> gen_tokens = model.generate(
+...     input_ids,
+...     do_sample=True,
+...     temperature=0.9,
+...     max_length=100,
+... )
 >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
 ```

@@ -78,13 +88,20 @@ model.
 >>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
 >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

->>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
-...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
-...          "researchers was the fact that the unicorns spoke perfect English."
+>>> prompt = (
+...     "In a shocking finding, scientists discovered a herd of unicorns living in a remote, "
+...     "previously unexplored valley, in the Andes Mountains. Even more surprising to the "
+...     "researchers was the fact that the unicorns spoke perfect English."
+... )

 >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids

->>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+>>> gen_tokens = model.generate(
+...     input_ids,
+...     do_sample=True,
+...     temperature=0.9,
+...     max_length=100,
+... )
 >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
 ```

--- a/docs/source/model_doc/herbert.mdx
+++ b/docs/source/model_doc/herbert.mdx
@@ -41,7 +41,7 @@ Examples of use:
 >>> tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
 >>> model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")

->>> encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
+>>> encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors="pt")
 >>> outputs = model(encoded_input)

 >>> # HerBERT can also be loaded using AutoTokenizer and AutoModel:
--- a/docs/source/model_doc/layoutlm.mdx
+++ b/docs/source/model_doc/layoutlm.mdx
@@ -53,12 +53,12 @@ Tips:

 ```python
 def normalize_bbox(bbox, width, height):
-     return [
-         int(1000 * (bbox[0] / width)),
-         int(1000 * (bbox[1] / height)),
-         int(1000 * (bbox[2] / width)),
-         int(1000 * (bbox[3] / height)),
-     ]
+    return [
+        int(1000 * (bbox[0] / width)),
+        int(1000 * (bbox[1] / height)),
+        int(1000 * (bbox[2] / width)),
+        int(1000 * (bbox[3] / height)),
+    ]
 ```

 Here, `width` and `height` correspond to the width and height of the original document in which the token
--- a/docs/source/model_doc/layoutlmv2.mdx
+++ b/docs/source/model_doc/layoutlmv2.mdx
@@ -70,12 +70,12 @@ Tips:

 ```python
 def normalize_bbox(bbox, width, height):
-     return [
-         int(1000 * (bbox[0] / width)),
-         int(1000 * (bbox[1] / height)),
-         int(1000 * (bbox[2] / width)),
-         int(1000 * (bbox[3] / height)),
-     ]
+    return [
+        int(1000 * (bbox[0] / width)),
+        int(1000 * (bbox[1] / height)),
+        int(1000 * (bbox[2] / width)),
+        int(1000 * (bbox[3] / height)),
+    ]
 ```

 Here, `width` and `height` correspond to the width and height of the original document in which the token
@@ -123,7 +123,7 @@ modality.
 ```python
 from transformers import LayoutLMv2FeatureExtractor, LayoutLMv2TokenizerFast, LayoutLMv2Processor

-feature_extractor = LayoutLMv2FeatureExtractor() # apply_ocr is set to True by default
+feature_extractor = LayoutLMv2FeatureExtractor()  # apply_ocr is set to True by default
 tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
 processor = LayoutLMv2Processor(feature_extractor, tokenizer)
 ```
@@ -158,7 +158,9 @@ from PIL import Image
 processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")

 image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
-encoding = processor(image, return_tensors="pt") # you can also add all tokenizer parameters here such as padding, truncation
+encoding = processor(
+    image, return_tensors="pt"
+)  # you can also add all tokenizer parameters here such as padding, truncation
 print(encoding.keys())
 # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
 ```
@@ -177,7 +179,7 @@ processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncas

 image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
 words = ["hello", "world"]
-boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]  # make sure to normalize your bounding boxes
 encoding = processor(image, words, boxes=boxes, return_tensors="pt")
 print(encoding.keys())
 # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
@@ -199,7 +201,7 @@ processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncas

 image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
 words = ["hello", "world"]
-boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]  # make sure to normalize your bounding boxes
 word_labels = [1, 2]
 encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
 print(encoding.keys())
@@ -219,7 +221,7 @@ processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncas

 image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
 question = "What's his name?"
-encoding = processor(image, question, return_tensors="pt") 
+encoding = processor(image, question, return_tensors="pt")
 print(encoding.keys())
 # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
 ```
@@ -238,8 +240,8 @@ processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncas
 image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
 question = "What's his name?"
 words = ["hello", "world"]
-boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
-encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")  
+boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]  # make sure to normalize your bounding boxes
+encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")
 print(encoding.keys())
 # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
 ```
--- a/docs/source/model_doc/layoutxlm.mdx
+++ b/docs/source/model_doc/layoutxlm.mdx
@@ -34,7 +34,7 @@ One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like
 ```python
 from transformers import LayoutLMv2Model

-model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')
+model = LayoutLMv2Model.from_pretrained("microsoft/layoutxlm-base")
 ```

 Note that LayoutXLM has its own tokenizer, based on
@@ -44,7 +44,7 @@ follows:
 ```python
 from transformers import LayoutXLMTokenizer

-tokenizer = LayoutXLMTokenizer.from_pretrained('microsoft/layoutxlm-base')
+tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base")
 ```

 Similar to LayoutLMv2, you can use [`LayoutXLMProcessor`] (which internally applies
--- a/docs/source/model_doc/longformer.mdx
+++ b/docs/source/model_doc/longformer.mdx
@@ -75,8 +75,8 @@ For more information, please refer to the official [paper](https://arxiv.org/pdf
 trained and should be used as follows:

 ```python
-input_ids = tokenizer.encode('This is a sentence from [MASK] training data', return_tensors='pt')
-mlm_labels = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+input_ids = tokenizer.encode("This is a sentence from [MASK] training data", return_tensors="pt")
+mlm_labels = tokenizer.encode("This is a sentence from the training data", return_tensors="pt")

 loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]
 ```
--- a/docs/source/model_doc/luke.mdx
+++ b/docs/source/model_doc/luke.mdx
@@ -84,24 +84,27 @@ Example:

 >>> model = LukeModel.from_pretrained("studio-ousia/luke-base")
 >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base")
-
 # Example 1: Computing the contextualized entity representation corresponding to the entity mention "Beyoncé"
+
 >>> text = "Beyoncé lives in Los Angeles."
 >>> entity_spans = [(0, 7)]  # character-based entity span corresponding to "Beyoncé"
 >>> inputs = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
 >>> outputs = model(**inputs)
 >>> word_last_hidden_state = outputs.last_hidden_state
 >>> entity_last_hidden_state = outputs.entity_last_hidden_state
-
 # Example 2: Inputting Wikipedia entities to obtain enriched contextualized representations
->>> entities = ["Beyoncé", "Los Angeles"]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
+
+>>> entities = [
+...     "Beyoncé",
+...     "Los Angeles",
+... ]  # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles"
 >>> entity_spans = [(0, 7), (17, 28)]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
 >>> inputs = tokenizer(text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt")
 >>> outputs = model(**inputs)
 >>> word_last_hidden_state = outputs.last_hidden_state
 >>> entity_last_hidden_state = outputs.entity_last_hidden_state
-
 # Example 3: Classifying the relationship between two entities using LukeForEntityPairClassification head model
+
 >>> model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
 >>> tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred")
 >>> entity_spans = [(0, 7), (17, 28)]  # character-based entity spans corresponding to "Beyoncé" and "Los Angeles"
--- a/docs/source/model_doc/m2m_100.mdx
+++ b/docs/source/model_doc/m2m_100.mdx
@@ -49,8 +49,8 @@ examples. To install `sentencepiece` run `pip install sentencepiece`.
 ```python
 from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer

-model = M2M100ForConditionalGeneration.from_pretrained('facebook/m2m100_418M')
-tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M', src_lang="en", tgt_lang="fr")
+model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="fr")

 src_text = "Life is like a box of chocolates."
 tgt_text = "La vie est comme une boîte de chocolat."
@@ -59,7 +59,7 @@ model_inputs = tokenizer(src_text, return_tensors="pt")
 with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_text, return_tensors="pt").input_ids

-loss = model(**model_inputs, labels=labels) # forward pass
+loss = model(**model_inputs, labels=labels)  # forward pass
 ```

 - Generation
--- a/docs/source/model_doc/marian.mdx
+++ b/docs/source/model_doc/marian.mdx
@@ -65,13 +65,14 @@ require 3 character language codes:

 ```python
 >>> from transformers import MarianMTModel, MarianTokenizer
->>> src_text = [
-...     '>>fra<< this is a sentence in english that we want to translate to french',
-...     '>>por<< This should go to portuguese',
-...     '>>esp<< And this to Spanish'
->>> ]

->>> model_name = 'Helsinki-NLP/opus-mt-en-roa'
+>>> src_text = [
+...     ">>fra<< this is a sentence in english that we want to translate to french",
+...     ">>por<< This should go to portuguese",
+...     ">>esp<< And this to Spanish",
+... ]
+
+>>> model_name = "Helsinki-NLP/opus-mt-en-roa"
 >>> tokenizer = MarianTokenizer.from_pretrained(model_name)
 >>> print(tokenizer.supported_language_codes)
 ['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<']
@@ -88,11 +89,12 @@ Here is the code to see all available pretrained models on the hub:

 ```python
 from huggingface_hub import list_models
+
 model_list = list_models()
 org = "Helsinki-NLP"
 model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
-suffix = [x.split('/')[1] for x in model_ids]
-old_style_multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
+suffix = [x.split("/")[1] for x in model_ids]
+old_style_multi_models = [f"{org}/{s}" for s in suffix if s != s.lower()]
 ```

 ## Old Style Multi-Lingual Models
@@ -100,7 +102,7 @@ old_style_multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
 These are the old style multi-lingual models ported from the OPUS-MT-Train repo: and the members of each language
 group:

-```python
+```python no-style
 ['Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU',
 'Helsinki-NLP/opus-mt-ROMANCE-en',
 'Helsinki-NLP/opus-mt-SCANDINAVIA-SCANDINAVIA',
@@ -129,13 +131,14 @@ Example of translating english to many romance languages, using old-style 2 char

 ```python
 >>> from transformers import MarianMTModel, MarianTokenizer
->>> src_text = [
-...     '>>fr<< this is a sentence in english that we want to translate to french',
-...     '>>pt<< This should go to portuguese',
-...     '>>es<< And this to Spanish'
->>> ]

->>> model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
+>>> src_text = [
+...     ">>fr<< this is a sentence in english that we want to translate to french",
+...     ">>pt<< This should go to portuguese",
+...     ">>es<< And this to Spanish",
+... ]
+
+>>> model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
 >>> tokenizer = MarianTokenizer.from_pretrained(model_name)

 >>> model = MarianMTModel.from_pretrained(model_name)
--- a/docs/source/model_doc/mbart.mdx
+++ b/docs/source/model_doc/mbart.mdx
@@ -52,7 +52,7 @@ inside the context manager [`~MBartTokenizer.as_target_tokenizer`] to encode tar

 >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
 >>> # forward pass
->>> model(**inputs, labels=batch['labels'])
+>>> model(**inputs, labels=batch["labels"])
 ```

 - Generation
@@ -106,13 +106,13 @@ model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
 tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")

 src_text = " UN Chief Says There Is No Military Solution in Syria"
-tgt_text =  "Şeful ONU declară că nu există o soluţie militară în Siria"
+tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"

 model_inputs = tokenizer(src_text, return_tensors="pt")
 with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_text, return_tensors="pt").input_ids

-model(**model_inputs, labels=labels) # forward pass
+model(**model_inputs, labels=labels)  # forward pass
 ```

 - Generation
--- a/docs/source/model_doc/mluke.mdx
+++ b/docs/source/model_doc/mluke.mdx
@@ -38,7 +38,7 @@ One can directly plug in the weights of mLUKE into a LUKE model, like so:
 ```python
 from transformers import LukeModel

-model = LukeModel.from_pretrained('studio-ousia/mluke-base')
+model = LukeModel.from_pretrained("studio-ousia/mluke-base")
 ```

 Note that mLUKE has its own tokenizer, [`MLukeTokenizer`]. You can initialize it as follows:
@@ -46,7 +46,7 @@ Note that mLUKE has its own tokenizer, [`MLukeTokenizer`]. You can initialize it
 ```python
 from transformers import MLukeTokenizer

-tokenizer = MLukeTokenizer.from_pretrained('studio-ousia/mluke-base')
+tokenizer = MLukeTokenizer.from_pretrained("studio-ousia/mluke-base")
 ```

 As mLUKE's architecture is equivalent to that of LUKE, one can refer to [LUKE's documentation page](luke) for all
--- a/docs/source/model_doc/pegasus.mdx
+++ b/docs/source/model_doc/pegasus.mdx
@@ -69,18 +69,22 @@ All the [checkpoints](https://huggingface.co/models?search=pegasus) are fine-tun
 ```python
 >>> from transformers import PegasusForConditionalGeneration, PegasusTokenizer
 >>> import torch
+
 >>> src_text = [
 ...     """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
->>> ]
+... ]

->>> model_name = 'google/pegasus-xsum'
->>> device = 'cuda' if torch.cuda.is_available() else 'cpu'
->>> tokenizer = PegasusTokenizer.from_pretrained(model_name)
->>> model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
->>> batch = tokenizer(src_text, truncation=True, padding='longest', return_tensors="pt").to(device)
->>> translated = model.generate(**batch)
->>> tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
->>> assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
+>>> model_name = "google/pegasus-xsum"
+>>> device = "cuda" if torch.cuda.is_available() else "cpu"
+>>> tokenizer = PegasusTokenizer.from_pretrained(model_name)
+>>> model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
+>>> batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(device)
+>>> translated = model.generate(**batch)
+>>> tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
+>>> assert (
+...     tgt_text[0]
+...     == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
+... )
 ```

 ## PegasusConfig
--- a/docs/source/model_doc/qdqbert.mdx
+++ b/docs/source/model_doc/qdqbert.mdx
@@ -75,9 +75,9 @@ tensors. After setting up the tensor quantizers, one can use the following examp
 ```python
 >>> # Find the TensorQuantizer and enable calibration
 >>> for name, module in model.named_modules():
->>>     if name.endswith('_input_quantizer'):
->>>         module.enable_calib()
->>>         module.disable_quant()  # Use full precision data to calibrate
+...     if name.endswith("_input_quantizer"):
+...         module.enable_calib()
+...         module.disable_quant()  # Use full precision data to calibrate

 >>> # Feeding data samples
 >>> model(x)
@@ -85,9 +85,9 @@ tensors. After setting up the tensor quantizers, one can use the following examp

 >>> # Finalize calibration
 >>> for name, module in model.named_modules():
->>>     if name.endswith('_input_quantizer'):
->>>         module.load_calib_amax()
->>>         module.enable_quant()
+...     if name.endswith("_input_quantizer"):
+...         module.load_calib_amax()
+...         module.enable_quant()

 >>> # If running on GPU, it needs to call .cuda() again because new tensors will be created by calibration process
 >>> model.cuda()
@@ -105,6 +105,7 @@ the instructions in [torch.onnx](https://pytorch.org/docs/stable/onnx.html). Exa

 ```python
 >>> from pytorch_quantization.nn import TensorQuantizer
+
 >>> TensorQuantizer.use_fb_fake_quant = True

 >>> # Load the calibrated model
--- a/docs/source/model_doc/reformer.mdx
+++ b/docs/source/model_doc/reformer.mdx
@@ -134,7 +134,7 @@ easily be trained on sequences as long as 64000 tokens.
 For training, the [`ReformerModelWithLMHead`] should be used as follows:

 ```python
-input_ids = tokenizer.encode('This is a sentence from the training data', return_tensors='pt')
+input_ids = tokenizer.encode("This is a sentence from the training data", return_tensors="pt")
 loss = model(input_ids, labels=input_ids)[0]
 ```

--- a/docs/source/model_doc/speech_to_text.mdx
+++ b/docs/source/model_doc/speech_to_text.mdx
@@ -52,11 +52,13 @@ be installed as follows: `apt install libsndfile1-dev`
 >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr")
 >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")

+
 >>> def map_to_array(batch):
 ...     speech, _ = sf.read(batch["file"])
 ...     batch["speech"] = speech
 ...     return batch

+
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.map(map_to_array)

@@ -83,16 +85,22 @@ be installed as follows: `apt install libsndfile1-dev`
 >>> model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")
 >>> processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st")

+
 >>> def map_to_array(batch):
 ...     speech, _ = sf.read(batch["file"])
 ...     batch["speech"] = speech
 ...     return batch

+
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.map(map_to_array)

 >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
->>> generated_ids = model.generate(input_ids=inputs["input_features"], attention_mask=inputs["attention_mask], forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"])
+>>> generated_ids = model.generate(
+...     input_ids=inputs["input_features"],
+...     attention_mask=inputs["attention_mask"],
+...     forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"],
+... )

 >>> translation = processor.batch_decode(generated_ids)
 ```
--- a/docs/source/model_doc/speech_to_text_2.mdx
+++ b/docs/source/model_doc/speech_to_text_2.mdx
@@ -58,11 +58,13 @@ predicted token ids.
 >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
 >>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")

+
 >>> def map_to_array(batch):
 ...     speech, _ = sf.read(batch["file"])
 ...     batch["speech"] = speech
 ...     return batch

+
 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
 >>> ds = ds.map(map_to_array)

@@ -81,7 +83,11 @@ predicted token ids.
 >>> from transformers import pipeline

 >>> librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
->>> asr = pipeline("automatic-speech-recognition", model="facebook/s2t-wav2vec2-large-en-de", feature_extractor="facebook/s2t-wav2vec2-large-en-de")
+>>> asr = pipeline(
+...     "automatic-speech-recognition",
+...     model="facebook/s2t-wav2vec2-large-en-de",
+...     feature_extractor="facebook/s2t-wav2vec2-large-en-de",
+... )

 >>> translation_de = asr(librispeech_en[0]["file"])
 ```
--- a/docs/source/model_doc/t5.mdx
+++ b/docs/source/model_doc/t5.mdx
@@ -98,8 +98,8 @@ language modeling head on top of the decoder.
  tokenizer = T5Tokenizer.from_pretrained("t5-small")
  model = T5ForConditionalGeneration.from_pretrained("t5-small")

-  input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-  labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
+  input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
+  labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
  # the forward function automatically creates the correct decoder_input_ids
  loss = model(input_ids=input_ids, labels=labels).loss
  ```
@@ -120,8 +120,8 @@ language modeling head on top of the decoder.
  tokenizer = T5Tokenizer.from_pretrained("t5-small")
  model = T5ForConditionalGeneration.from_pretrained("t5-small")

-  input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
-  labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
+  input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
+  labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt").input_ids
  # the forward function automatically creates the correct decoder_input_ids
  loss = model(input_ids=input_ids, labels=labels).loss
  ```
@@ -148,7 +148,7 @@ language modeling head on top of the decoder.
  ignored. The code example below illustrates all of this.

  ```python
-  from transformers import T5Tokenizer, T5ForConditionalGeneration 
+  from transformers import T5Tokenizer, T5ForConditionalGeneration
  import torch

  tokenizer = T5Tokenizer.from_pretrained("t5-small")
@@ -168,18 +168,19 @@ language modeling head on top of the decoder.
  # encode the inputs
  task_prefix = "translate English to French: "
  input_sequences = [input_sequence_1, input_sequence_2]
-  encoding = tokenizer([task_prefix + sequence for sequence in input_sequences], 
-                      padding='longest', 
-                      max_length=max_source_length, 
-                      truncation=True, 
-                      return_tensors="pt")
+  encoding = tokenizer(
+      [task_prefix + sequence for sequence in input_sequences],
+      padding="longest",
+      max_length=max_source_length,
+      truncation=True,
+      return_tensors="pt",
+  )
  input_ids, attention_mask = encoding.input_ids, encoding.attention_mask

  # encode the targets
-  target_encoding = tokenizer([output_sequence_1, output_sequence_2], 
-                              padding='longest', 
-                              max_length=max_target_length, 
-                              truncation=True)
+  target_encoding = tokenizer(
+      [output_sequence_1, output_sequence_2], padding="longest", max_length=max_target_length, truncation=True
+  )
  labels = target_encoding.input_ids

  # replace padding token id's of the labels by -100
@@ -218,12 +219,12 @@ There's also [this blog post](https://huggingface.co/blog/encoder-decoder#encode
 generation works in general in encoder-decoder models.

 ```python
-from transformers import T5Tokenizer, T5ForConditionalGeneration 
+from transformers import T5Tokenizer, T5ForConditionalGeneration

 tokenizer = T5Tokenizer.from_pretrained("t5-small")
 model = T5ForConditionalGeneration.from_pretrained("t5-small")

-input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+input_ids = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt").input_ids
 outputs = model.generate(input_ids)
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 # Das Haus ist wunderbar.
@@ -242,17 +243,17 @@ model = T5ForConditionalGeneration.from_pretrained("t5-small")

 # when generating, we will use the logits of right-most token to predict the next token
 # so the padding should be on the left
-tokenizer.padding_side = "left" 
-tokenizer.pad_token = tokenizer.eos_token # to avoid an error
+tokenizer.padding_side = "left"
+tokenizer.pad_token = tokenizer.eos_token  # to avoid an error

-task_prefix = 'translate English to German: '
-sentences = ['The house is wonderful.', 'I like to work in NYC.'] # use different length sentences to test batching
+task_prefix = "translate English to German: "
+sentences = ["The house is wonderful.", "I like to work in NYC."]  # use different length sentences to test batching
 inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)

 output_sequences = model.generate(
-    input_ids=inputs['input_ids'],
-    attention_mask=inputs['attention_mask'],
-    do_sample=False, # disable sampling to test if batching affects output
+    input_ids=inputs["input_ids"],
+    attention_mask=inputs["attention_mask"],
+    do_sample=False,  # disable sampling to test if batching affects output
 )

 print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))
--- a/docs/source/model_doc/t5v1.1.mdx
+++ b/docs/source/model_doc/t5v1.1.mdx
@@ -22,7 +22,7 @@ One can directly plug in the weights of T5v1.1 into a T5 model, like so:
 ```python
 from transformers import T5ForConditionalGeneration

-model = T5ForConditionalGeneration.from_pretrained('google/t5-v1_1-base')
+model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-base")
 ```

 T5 Version 1.1 includes the following improvements compared to the original T5 model:
--- a/docs/source/model_doc/tapas.mdx
+++ b/docs/source/model_doc/tapas.mdx
@@ -75,28 +75,28 @@ dependency in case you're using Tensorflow:
 >>> from transformers import TapasConfig, TapasForQuestionAnswering

 >>> # for example, the base sized model with default SQA configuration
->>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base')
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base")

 >>> # or, the base sized model with WTQ configuration
->>> config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wtq')
->>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+>>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq")
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)

 >>> # or, the base sized model with WikiSQL configuration
->>> config = TapasConfig('google-base-finetuned-wikisql-supervised')
->>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+>>> config = TapasConfig("google-base-finetuned-wikisql-supervised")
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
 ===PT-TF-SPLIT===
 >>> from transformers import TapasConfig, TFTapasForQuestionAnswering

 >>> # for example, the base sized model with default SQA configuration
->>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base')
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base")

 >>> # or, the base sized model with WTQ configuration
->>> config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wtq')
->>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+>>> config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq")
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)

 >>> # or, the base sized model with WikiSQL configuration
->>> config = TapasConfig('google-base-finetuned-wikisql-supervised')
->>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+>>> config = TapasConfig("google-base-finetuned-wikisql-supervised")
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
 ```

 Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. You can also experiment by defining any hyperparameters you want when initializing [`TapasConfig`], and then create a [`TapasForQuestionAnswering`] based on that configuration. For example, if you have a dataset that has both conversational questions and questions that might involve aggregation, then you can do it this way. Here's an example:
@@ -107,14 +107,14 @@ Of course, you don't necessarily have to follow one of these three ways in which
 >>> # you can initialize the classification heads any way you want (see docs of TapasConfig)
 >>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True)
 >>> # initializing the pre-trained base sized model with our custom classification heads
->>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+>>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
 ===PT-TF-SPLIT===
 >>> from transformers import TapasConfig, TFTapasForQuestionAnswering

 >>> # you can initialize the classification heads any way you want (see docs of TapasConfig)
 >>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True)
 >>> # initializing the pre-trained base sized model with our custom classification heads
->>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
 ```

 What you can also do is start from an already fine-tuned checkpoint. A note here is that the already fine-tuned checkpoint on WTQ has some issues due to the L2-loss which is somewhat brittle. See [here](https://github.com/google-research/tapas/issues/91#issuecomment-735719340) for more info.
@@ -154,15 +154,26 @@ inputs to be fine-tuned:
 >>> from transformers import TapasTokenizer
 >>> import pandas as pd

->>> model_name = 'google/tapas-base'
+>>> model_name = "google/tapas-base"
 >>> tokenizer = TapasTokenizer.from_pretrained(model_name)

->>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
->>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> queries = [
+...     "What is the name of the first actor?",
+...     "How many movies has George Clooney played in?",
+...     "What is the total number of movies?",
+... ]
 >>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]]
 >>> answer_text = [["Brad Pitt"], ["69"], ["209"]]
 >>> table = pd.DataFrame.from_dict(data)
->>> inputs = tokenizer(table=table, queries=queries, answer_coordinates=answer_coordinates, answer_text=answer_text, padding='max_length', return_tensors='pt')
+>>> inputs = tokenizer(
+...     table=table,
+...     queries=queries,
+...     answer_coordinates=answer_coordinates,
+...     answer_text=answer_text,
+...     padding="max_length",
+...     return_tensors="pt",
+... )
 >>> inputs
 {'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]),
 'numeric_values': tensor([[ ... ]]), 'numeric_values_scale: tensor([[ ... ]]), labels: tensor([[ ... ]])}
@@ -170,15 +181,26 @@ inputs to be fine-tuned:
 >>> from transformers import TapasTokenizer
 >>> import pandas as pd

->>> model_name = 'google/tapas-base'
+>>> model_name = "google/tapas-base"
 >>> tokenizer = TapasTokenizer.from_pretrained(model_name)

->>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
->>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> queries = [
+...     "What is the name of the first actor?",
+...     "How many movies has George Clooney played in?",
+...     "What is the total number of movies?",
+... ]
 >>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]]
 >>> answer_text = [["Brad Pitt"], ["69"], ["209"]]
 >>> table = pd.DataFrame.from_dict(data)
->>> inputs = tokenizer(table=table, queries=queries, answer_coordinates=answer_coordinates, answer_text=answer_text, padding='max_length', return_tensors='tf')
+>>> inputs = tokenizer(
+...     table=table,
+...     queries=queries,
+...     answer_coordinates=answer_coordinates,
+...     answer_text=answer_text,
+...     padding="max_length",
+...     return_tensors="tf",
+... )
 >>> inputs
 {'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]),
 'numeric_values': tensor([[ ... ]]), 'numeric_values_scale: tensor([[ ... ]]), labels: tensor([[ ... ]])}
@@ -194,32 +216,37 @@ Of course, this only shows how to encode a single training example. It is advise
 >>> tsv_path = "your_path_to_the_tsv_file"
 >>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files"

+
 >>> class TableDataset(torch.utils.data.Dataset):
 ...     def __init__(self, data, tokenizer):
 ...         self.data = data
 ...         self.tokenizer = tokenizer
 ...
 ...     def __getitem__(self, idx):
 ...         item = data.iloc[idx]
-...         table = pd.read_csv(table_csv_path + item.table_file).astype(str) # be sure to make your table data text only
-...         encoding = self.tokenizer(table=table, 
-...                                   queries=item.question, 
-...                                   answer_coordinates=item.answer_coordinates, 
-...                                   answer_text=item.answer_text,
-...                                   truncation=True,
-...                                   padding="max_length",
-...                                   return_tensors="pt"
+...         table = pd.read_csv(table_csv_path + item.table_file).astype(
+...             str
+...         )  # be sure to make your table data text only
+...         encoding = self.tokenizer(
+...             table=table,
+...             queries=item.question,
+...             answer_coordinates=item.answer_coordinates,
+...             answer_text=item.answer_text,
+...             truncation=True,
+...             padding="max_length",
+...             return_tensors="pt",
 ...         )
 ...         # remove the batch dimension which the tokenizer adds by default
 ...         encoding = {key: val.squeeze(0) for key, val in encoding.items()}
 ...         # add the float_answer which is also required (weak supervision for aggregation case)
-...         encoding["float_answer"] = torch.tensor(item.float_answer) 
+...         encoding["float_answer"] = torch.tensor(item.float_answer)
 ...         return encoding
 ...
 ...     def __len__(self):
-...        return len(self.data)
+...         return len(self.data)

->>> data = pd.read_csv(tsv_path, sep='\t')
+
+>>> data = pd.read_csv(tsv_path, sep="\t")
 >>> train_dataset = TableDataset(data, tokenizer)
 >>> train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
 ===PT-TF-SPLIT===
@@ -229,44 +256,50 @@ Of course, this only shows how to encode a single training example. It is advise
 >>> tsv_path = "your_path_to_the_tsv_file"
 >>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files"

+
 >>> class TableDataset:
 ...     def __init__(self, data, tokenizer):
 ...         self.data = data
 ...         self.tokenizer = tokenizer
 ...
 ...     def __iter__(self):
 ...         for idx in range(self.__len__()):
 ...             item = self.data.iloc[idx]
-...             table = pd.read_csv(table_csv_path + item.table_file).astype(str) # be sure to make your table data text only
-...             encoding = self.tokenizer(table=table, 
-...                                   queries=item.question, 
-...                                   answer_coordinates=item.answer_coordinates, 
-...                                   answer_text=item.answer_text,
-...                                   truncation=True,
-...                                   padding="max_length",
-...                                   return_tensors="tf"
+...             table = pd.read_csv(table_csv_path + item.table_file).astype(
+...                 str
+...             )  # be sure to make your table data text only
+...             encoding = self.tokenizer(
+...                 table=table,
+...                 queries=item.question,
+...                 answer_coordinates=item.answer_coordinates,
+...                 answer_text=item.answer_text,
+...                 truncation=True,
+...                 padding="max_length",
+...                 return_tensors="tf",
 ...             )
 ...             # remove the batch dimension which the tokenizer adds by default
-...             encoding = {key: tf.squeeze(val,0) for key, val in encoding.items()}
+...             encoding = {key: tf.squeeze(val, 0) for key, val in encoding.items()}
 ...             # add the float_answer which is also required (weak supervision for aggregation case)
-...             encoding["float_answer"] = tf.convert_to_tensor(item.float_answer,dtype=tf.float32)
-...             yield encoding['input_ids'], encoding['attention_mask'], encoding['numeric_values'], \
-...                   encoding['numeric_values_scale'], encoding['token_type_ids'], encoding['labels'], \
-...                   encoding['float_answer']
+...             encoding["float_answer"] = tf.convert_to_tensor(item.float_answer, dtype=tf.float32)
+...             yield encoding["input_ids"], encoding["attention_mask"], encoding["numeric_values"], encoding[
+...                 "numeric_values_scale"
+...             ], encoding["token_type_ids"], encoding["labels"], encoding["float_answer"]
 ...
 ...     def __len__(self):
-...        return len(self.data)
+...         return len(self.data)

->>> data = pd.read_csv(tsv_path, sep='\t')
+
+>>> data = pd.read_csv(tsv_path, sep="\t")
 >>> train_dataset = TableDataset(data, tokenizer)
 >>> output_signature = (
-... tf.TensorSpec(shape=(512,), dtype=tf.int32),
-... tf.TensorSpec(shape=(512,), dtype=tf.int32),
-... tf.TensorSpec(shape=(512,), dtype=tf.float32),
-... tf.TensorSpec(shape=(512,), dtype=tf.float32),
-... tf.TensorSpec(shape=(512,7), dtype=tf.int32),
-... tf.TensorSpec(shape=(512,), dtype=tf.int32),
-... tf.TensorSpec(shape=(512,), dtype=tf.float32))
+...     tf.TensorSpec(shape=(512,), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.float32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.float32),
+...     tf.TensorSpec(shape=(512, 7), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.int32),
+...     tf.TensorSpec(shape=(512,), dtype=tf.float32),
+... )
 >>> train_dataloader = tf.data.Dataset.from_generator(train_dataset, output_signature=output_signature).batch(32)
 ```

@@ -282,15 +315,15 @@ You can then fine-tune [`TapasForQuestionAnswering`] or [`TFTapasForQuestionAnsw

 >>> # this is the default WTQ configuration
 >>> config = TapasConfig(
-...            num_aggregation_labels = 4,
-...            use_answer_as_supervision = True,
-...            answer_loss_cutoff = 0.664694,
-...            cell_selection_preference = 0.207951,
-...            huber_loss_delta = 0.121194,
-...            init_cell_selection_weights_to_zero = True,
-...            select_one_column = True,
-...            allow_empty_column_selection = False,
-...            temperature = 0.0352513,
+...     num_aggregation_labels=4,
+...     use_answer_as_supervision=True,
+...     answer_loss_cutoff=0.664694,
+...     cell_selection_preference=0.207951,
+...     huber_loss_delta=0.121194,
+...     init_cell_selection_weights_to_zero=True,
+...     select_one_column=True,
+...     allow_empty_column_selection=False,
+...     temperature=0.0352513,
 ... )
 >>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)

@@ -298,8 +331,8 @@ You can then fine-tune [`TapasForQuestionAnswering`] or [`TFTapasForQuestionAnsw

 >>> model.train()
 >>> for epoch in range(2):  # loop over the dataset multiple times
-...    for batch in train_dataloader:
-...         # get the inputs; 
+...     for batch in train_dataloader:
+...         # get the inputs;
 ...         input_ids = batch["input_ids"]
 ...         attention_mask = batch["attention_mask"]
 ...         token_type_ids = batch["token_type_ids"]
@@ -312,9 +345,15 @@ You can then fine-tune [`TapasForQuestionAnswering`] or [`TFTapasForQuestionAnsw
 ...         optimizer.zero_grad()

 ...         # forward + backward + optimize
-...         outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, 
-...                        labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale, 
-...                        float_answer=float_answer)
+...         outputs = model(
+...             input_ids=input_ids,
+...             attention_mask=attention_mask,
+...             token_type_ids=token_type_ids,
+...             labels=labels,
+...             numeric_values=numeric_values,
+...             numeric_values_scale=numeric_values_scale,
+...             float_answer=float_answer,
+...         )
 ...         loss = outputs.loss
 ...         loss.backward()
 ...         optimizer.step()
@@ -324,23 +363,23 @@ You can then fine-tune [`TapasForQuestionAnswering`] or [`TFTapasForQuestionAnsw

 >>> # this is the default WTQ configuration
 >>> config = TapasConfig(
-...            num_aggregation_labels = 4,
-...            use_answer_as_supervision = True,
-...            answer_loss_cutoff = 0.664694,
-...            cell_selection_preference = 0.207951,
-...            huber_loss_delta = 0.121194,
-...            init_cell_selection_weights_to_zero = True,
-...            select_one_column = True,
-...            allow_empty_column_selection = False,
-...            temperature = 0.0352513,
+...     num_aggregation_labels=4,
+...     use_answer_as_supervision=True,
+...     answer_loss_cutoff=0.664694,
+...     cell_selection_preference=0.207951,
+...     huber_loss_delta=0.121194,
+...     init_cell_selection_weights_to_zero=True,
+...     select_one_column=True,
+...     allow_empty_column_selection=False,
+...     temperature=0.0352513,
 ... )
 >>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)

 >>> optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

 >>> for epoch in range(2):  # loop over the dataset multiple times
-...    for batch in train_dataloader:
-...         # get the inputs; 
+...     for batch in train_dataloader:
+...         # get the inputs;
 ...         input_ids = batch[0]
 ...         attention_mask = batch[1]
 ...         token_type_ids = batch[4]
@@ -351,9 +390,15 @@ You can then fine-tune [`TapasForQuestionAnswering`] or [`TFTapasForQuestionAnsw

 ...         # forward + backward + optimize
 ...         with tf.GradientTape() as tape:
-...              outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, 
-...                        labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale, 
-...                        float_answer=float_answer )
+...             outputs = model(
+...                 input_ids=input_ids,
+...                 attention_mask=attention_mask,
+...                 token_type_ids=token_type_ids,
+...                 labels=labels,
+...                 numeric_values=numeric_values,
+...                 numeric_values_scale=numeric_values_scale,
+...                 float_answer=float_answer,
+...             )
 ...         grads = tape.gradient(outputs.loss, model.trainable_weights)
 ...         optimizer.apply_gradients(zip(grads, model.trainable_weights))
 ```
@@ -366,47 +411,49 @@ However, note that inference is **different** depending on whether or not the se

 ```py
 >>> from transformers import TapasTokenizer, TapasForQuestionAnswering
->>> import pandas as pd 
+>>> import pandas as pd

->>> model_name = 'google/tapas-base-finetuned-wtq'
+>>> model_name = "google/tapas-base-finetuned-wtq"
 >>> model = TapasForQuestionAnswering.from_pretrained(model_name)
 >>> tokenizer = TapasTokenizer.from_pretrained(model_name)

->>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
->>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> queries = [
+...     "What is the name of the first actor?",
+...     "How many movies has George Clooney played in?",
+...     "What is the total number of movies?",
+... ]
 >>> table = pd.DataFrame.from_dict(data)
->>> inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="pt") 
+>>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt")
 >>> outputs = model(**inputs)
 >>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
-...         inputs, 
-...         outputs.logits.detach(), 
-...         outputs.logits_aggregation.detach()
+...     inputs, outputs.logits.detach(), outputs.logits_aggregation.detach()
 ... )

 >>> # let's print out the results:
->>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3:"COUNT"}
+>>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
 >>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]

 >>> answers = []
 >>> for coordinates in predicted_answer_coordinates:
-...   if len(coordinates) == 1:
-...     # only a single cell:
-...     answers.append(table.iat[coordinates[0]])
-...   else:
-...     # multiple cells
-...     cell_values = []
-...     for coordinate in coordinates:
-...        cell_values.append(table.iat[coordinate])
-...     answers.append(", ".join(cell_values))
+...     if len(coordinates) == 1:
+...         # only a single cell:
+...         answers.append(table.iat[coordinates[0]])
+...     else:
+...         # multiple cells
+...         cell_values = []
+...         for coordinate in coordinates:
+...             cell_values.append(table.iat[coordinate])
+...         answers.append(", ".join(cell_values))

 >>> display(table)
 >>> print("")
 >>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
-...   print(query)
-...   if predicted_agg == "NONE":
-...     print("Predicted answer: " + answer)
-...   else:
-...     print("Predicted answer: " + predicted_agg + " > " + answer)    
+...     print(query)
+...     if predicted_agg == "NONE":
+...         print("Predicted answer: " + answer)
+...     else:
+...         print("Predicted answer: " + predicted_agg + " > " + answer)
 What is the name of the first actor?
 Predicted answer: Brad Pitt
 How many movies has George Clooney played in?
@@ -415,47 +462,49 @@ What is the total number of movies?
 Predicted answer: SUM > 87, 53, 69
 ===PT-TF-SPLIT===
 >>> from transformers import TapasTokenizer, TFTapasForQuestionAnswering
->>> import pandas as pd 
+>>> import pandas as pd

->>> model_name = 'google/tapas-base-finetuned-wtq'
+>>> model_name = "google/tapas-base-finetuned-wtq"
 >>> model = TFTapasForQuestionAnswering.from_pretrained(model_name)
 >>> tokenizer = TapasTokenizer.from_pretrained(model_name)

->>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
->>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+>>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
+>>> queries = [
+...     "What is the name of the first actor?",
+...     "How many movies has George Clooney played in?",
+...     "What is the total number of movies?",
+... ]
 >>> table = pd.DataFrame.from_dict(data)
->>> inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="tf") 
+>>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf")
 >>> outputs = model(**inputs)
 >>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
-...         inputs, 
-...         outputs.logits, 
-...         outputs.logits_aggregation
+...     inputs, outputs.logits, outputs.logits_aggregation
 ... )

 >>> # let's print out the results:
->>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3:"COUNT"}
+>>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}
 >>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]

 >>> answers = []
 >>> for coordinates in predicted_answer_coordinates:
-...   if len(coordinates) == 1:
-...     # only a single cell:
-...     answers.append(table.iat[coordinates[0]])
-...   else:
-...     # multiple cells
-...     cell_values = []
-...     for coordinate in coordinates:
-...        cell_values.append(table.iat[coordinate])
-...     answers.append(", ".join(cell_values))
+...     if len(coordinates) == 1:
+...         # only a single cell:
+...         answers.append(table.iat[coordinates[0]])
+...     else:
+...         # multiple cells
+...         cell_values = []
+...         for coordinate in coordinates:
+...             cell_values.append(table.iat[coordinate])
+...         answers.append(", ".join(cell_values))

 >>> display(table)
 >>> print("")
 >>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
-...   print(query)
-...   if predicted_agg == "NONE":
-...     print("Predicted answer: " + answer)
-...   else:
-...     print("Predicted answer: " + predicted_agg + " > " + answer)    
+...     print(query)
+...     if predicted_agg == "NONE":
+...         print("Predicted answer: " + answer)
+...     else:
+...         print("Predicted answer: " + predicted_agg + " > " + answer)
 What is the name of the first actor?
 Predicted answer: Brad Pitt
 How many movies has George Clooney played in?
--- a/docs/source/model_doc/visual_bert.mdx
+++ b/docs/source/model_doc/visual_bert.mdx
@@ -77,11 +77,13 @@ The following example shows how to get the last hidden state using [`VisualBertM

 >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
 >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
->>> inputs.update({
-...     "visual_embeds": visual_embeds,
-...     "visual_token_type_ids": visual_token_type_ids,
-...     "visual_attention_mask": visual_attention_mask
-... })
+>>> inputs.update(
+...     {
+...         "visual_embeds": visual_embeds,
+...         "visual_token_type_ids": visual_token_type_ids,
+...         "visual_attention_mask": visual_attention_mask,
+...     }
+... )
 >>> outputs = model(**inputs)
 >>> last_hidden_state = outputs.last_hidden_state
 ```