From 986526a0e4f5ab803581074e9e4069c3edcff1dc Mon Sep 17 00:00:00 2001
From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Date: Fri, 29 Jul 2022 08:09:09 -0400
Subject: [PATCH] Replace `as_target` context managers by direct calls (#18325)

* Preliminary work on tokenizers

* Quality + fix tests

* Treat processors

* Fix pad

* Remove all uses of  in tests, docs and examples

* Replace all as_target_tokenizer

* Fix tests

* Fix quality

* Update examples/flax/image-captioning/run_image_captioning_flax.py

Co-authored-by: amyeroberts <amy@huggingface.co>

* Style

Co-authored-by: amyeroberts <amy@huggingface.co>
---
 docs/source/en/model_doc/m2m_100.mdx          |   4 +-
 docs/source/en/model_doc/marian.mdx           |   2 +-
 docs/source/en/model_doc/mbart.mdx            |  17 +--
 docs/source/en/model_doc/mctct.mdx            |   1 -
 docs/source/en/model_doc/nllb.mdx             |   1 -
 docs/source/en/model_doc/plbart.mdx           |  12 +-
 .../en/model_doc/speech-encoder-decoder.mdx   |   2 +-
 docs/source/en/model_doc/speech_to_text.mdx   |   1 -
 docs/source/en/model_doc/speech_to_text_2.mdx |   1 -
 docs/source/en/model_doc/trocr.mdx            |   1 -
 docs/source/en/model_doc/wav2vec2.mdx         |   2 -
 docs/source/en/preprocessing.mdx              |   4 +-
 docs/source/en/tasks/asr.mdx                  |  19 +--
 docs/source/en/tasks/summarization.mdx        |   5 +-
 docs/source/en/tasks/translation.mdx          |   7 +-
 docs/source/es/preprocessing.mdx              |   4 +-
 docs/source/it/preprocessing.mdx              |   4 +-
 .../run_image_captioning_flax.py              |  13 +-
 .../summarization/run_summarization_flax.py   |  11 +-
 .../question-answering/run_seq2seq_qa.py      |  10 +-
 .../run_speech_recognition_ctc.py             |  13 +-
 .../summarization/run_summarization.py        |   5 +-
 .../run_summarization_no_trainer.py           |   5 +-
 .../pytorch/translation/run_translation.py    |   5 +-
 .../translation/run_translation_no_trainer.py |   5 +-
 .../run_speech_recognition_ctc_bnb.py         |  13 +-
 .../run_speech_recognition_ctc_streaming.py   |  13 +-
 .../tapex/run_wikisql_with_tapex.py           |  13 +-
 .../run_wikitablequestions_with_tapex.py      |  13 +-
 .../research_projects/wav2vec2/run_asr.py     |  22 ++--
 .../wav2vec2/run_common_voice.py              |  24 ++--
 .../xtreme-s/run_xtreme_s.py                  |  13 +-
 .../summarization/run_summarization.py        |   5 +-
 .../tensorflow/translation/run_translation.py |   5 +-
 .../models/hubert/modeling_tf_hubert.py       |   5 +-
 .../models/m2m_100/tokenization_m2m_100.py    |  19 +--
 .../models/marian/tokenization_marian.py      |  20 +--
 .../models/mbart/tokenization_mbart.py        |  20 +--
 .../models/mbart/tokenization_mbart_fast.py   |  20 +--
 .../models/mbart50/tokenization_mbart50.py    |  21 ++--
 .../mbart50/tokenization_mbart50_fast.py      |  21 ++--
 .../models/mctct/processing_mctct.py          |  62 +++++++++-
 .../models/mt5/modeling_flax_mt5.py           |   9 +-
 src/transformers/models/mt5/modeling_mt5.py   |   9 +-
 .../models/mt5/modeling_tf_mt5.py             |   9 +-
 .../models/nllb/tokenization_nllb.py          |  20 +--
 .../models/nllb/tokenization_nllb_fast.py     |  20 +--
 .../models/plbart/tokenization_plbart.py      |  20 +--
 src/transformers/models/rag/modeling_rag.py   |   6 +-
 .../models/rag/tokenization_rag.py            |  36 +++---
 .../modeling_speech_encoder_decoder.py        |   3 +-
 .../processing_speech_to_text.py              |  39 +++++-
 .../processing_speech_to_text_2.py            |  39 +++++-
 .../models/tapex/tokenization_tapex.py        |  31 +----
 .../models/trocr/processing_trocr.py          |  39 +++++-
 .../models/wav2vec2/modeling_tf_wav2vec2.py   |   5 +-
 .../models/wav2vec2/processing_wav2vec2.py    |  61 +++++++++-
 .../processing_wav2vec2_with_lm.py            |  64 +++++++++-
 src/transformers/tokenization_utils_base.py   | 114 ++++++++++++++++--
 src/transformers/utils/doc.py                 |   6 +-
 tests/models/bart/test_tokenization_bart.py   |   8 +-
 tests/models/byt5/test_tokenization_byt5.py   |  13 +-
 .../models/canine/test_tokenization_canine.py |   5 +-
 tests/models/dpr/test_tokenization_dpr.py     |   1 -
 .../m2m_100/test_tokenization_m2m_100.py      |  20 +--
 tests/models/marian/test_modeling_marian.py   |   5 +-
 .../models/marian/test_tokenization_marian.py |   5 +-
 tests/models/mbart/test_tokenization_mbart.py |  37 +++---
 .../mbart50/test_tokenization_mbart50.py      |  37 +++---
 tests/models/mctct/test_processor_mctct.py    |   3 +-
 tests/models/mvp/test_tokenization_mvp.py     |  11 +-
 tests/models/nllb/test_tokenization_nllb.py   |  23 ++--
 .../pegasus/test_tokenization_pegasus.py      |  14 +--
 .../perceiver/test_tokenization_perceiver.py  |   7 +-
 .../models/plbart/test_tokenization_plbart.py |  34 +++---
 .../test_processor_speech_to_text.py          |   3 +-
 tests/models/t5/test_tokenization_t5.py       |  13 +-
 tests/models/tapex/test_tokenization_tapex.py |  37 +++---
 .../wav2vec2/test_processor_wav2vec2.py       |   3 +-
 .../test_processor_wav2vec2_with_lm.py        |   3 +-
 80 files changed, 725 insertions(+), 550 deletions(-)

diff --git a/docs/source/en/model_doc/m2m_100.mdx b/docs/source/en/model_doc/m2m_100.mdx
index 65e119aa4e..f0a7714d24 100644
--- a/docs/source/en/model_doc/m2m_100.mdx
+++ b/docs/source/en/model_doc/m2m_100.mdx
@@ -55,9 +55,7 @@ tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en
 src_text = "Life is like a box of chocolates."
 tgt_text = "La vie est comme une boîte de chocolat."
 
-model_inputs = tokenizer(src_text, return_tensors="pt")
-with tokenizer.as_target_tokenizer():
-    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
+model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
 
 loss = model(**model_inputs, labels=labels)  # forward pass
 ```
diff --git a/docs/source/en/model_doc/marian.mdx b/docs/source/en/model_doc/marian.mdx
index 7b10c9309a..9d0a9ff257 100644
--- a/docs/source/en/model_doc/marian.mdx
+++ b/docs/source/en/model_doc/marian.mdx
@@ -155,7 +155,7 @@ Example of translating english to many romance languages, using old-style 2 char
 ## MarianTokenizer
 
 [[autodoc]] MarianTokenizer
-    - as_target_tokenizer
+    - build_inputs_with_special_tokens
 
 ## MarianModel
 
diff --git a/docs/source/en/model_doc/mbart.mdx b/docs/source/en/model_doc/mbart.mdx
index 0f3d82ce5d..b24e31f33c 100644
--- a/docs/source/en/model_doc/mbart.mdx
+++ b/docs/source/en/model_doc/mbart.mdx
@@ -34,8 +34,8 @@ model is multilingual it expects the sequences in a different format. A special
 source and target text. The source text format is `X [eos, src_lang_code]` where `X` is the source text. The
 target text format is `[tgt_lang_code] X [eos]`. `bos` is never used.
 
-The regular [`~MBartTokenizer.__call__`] will encode source text format, and it should be wrapped
-inside the context manager [`~MBartTokenizer.as_target_tokenizer`] to encode target text format.
+The regular [`~MBartTokenizer.__call__`] will encode source text format passed as first argument or with the `text`
+keyword, and target text format passed with the `text_label` keyword argument.
 
 - Supervised training
 
@@ -46,13 +46,11 @@ inside the context manager [`~MBartTokenizer.as_target_tokenizer`] to encode tar
 >>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
 >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
 
->>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
->>> with tokenizer.as_target_tokenizer():
-...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")
+>>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
 
 >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
 >>> # forward pass
->>> model(**inputs, labels=batch["labels"])
+>>> model(**inputs)
 ```
 
 - Generation
@@ -108,11 +106,9 @@ tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_
 src_text = " UN Chief Says There Is No Military Solution in Syria"
 tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
 
-model_inputs = tokenizer(src_text, return_tensors="pt")
-with tokenizer.as_target_tokenizer():
-    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
+model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
 
-model(**model_inputs, labels=labels)  # forward pass
+model(**model_inputs)  # forward pass
 ```
 
 - Generation
@@ -154,7 +150,6 @@ tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
 ## MBartTokenizer
 
 [[autodoc]] MBartTokenizer
-    - as_target_tokenizer
     - build_inputs_with_special_tokens
 
 ## MBartTokenizerFast
diff --git a/docs/source/en/model_doc/mctct.mdx b/docs/source/en/model_doc/mctct.mdx
index f064f1e3d0..531508cfa9 100644
--- a/docs/source/en/model_doc/mctct.mdx
+++ b/docs/source/en/model_doc/mctct.mdx
@@ -48,7 +48,6 @@ This model was contributed by [cwkeam](https://huggingface.co/cwkeam). The origi
     - save_pretrained
     - batch_decode
     - decode
-    - as_target_processor
 
 
 ## MCTCTModel
diff --git a/docs/source/en/model_doc/nllb.mdx b/docs/source/en/model_doc/nllb.mdx
index 477ef1ca83..d2c0089fa3 100644
--- a/docs/source/en/model_doc/nllb.mdx
+++ b/docs/source/en/model_doc/nllb.mdx
@@ -91,7 +91,6 @@ UN-Chef sagt, es gibt keine militärische Lösung in Syrien
 ## NllbTokenizer
 
 [[autodoc]] NllbTokenizer
-    - as_target_tokenizer
     - build_inputs_with_special_tokens
 
 ## NllbTokenizerFast
diff --git a/docs/source/en/model_doc/plbart.mdx b/docs/source/en/model_doc/plbart.mdx
index 6e3e4a5b77..0755bb9a56 100644
--- a/docs/source/en/model_doc/plbart.mdx
+++ b/docs/source/en/model_doc/plbart.mdx
@@ -45,8 +45,9 @@ target text format is `[tgt_lang_code] X [eos]`. `bos` is never used.
 
 However, for fine-tuning, in some cases no language token is provided in cases where a single language is used. Please refer to [the paper](https://arxiv.org/abs/2103.06333) to learn more about this.
 
-In cases where the language code is needed, The regular [`~PLBartTokenizer.__call__`] will encode source text format, and it should be wrapped
-inside the context manager [`~PLBartTokenizer.as_target_tokenizer`] to encode target text format.
+In cases where the language code is needed, the regular [`~PLBartTokenizer.__call__`] will encode source text format 
+when you pass texts as the first argument or with the keyword argument `text`, and will encode target text format if
+it's passed with the `text_target` keyword argument.
 
 - Supervised training
 
@@ -56,11 +57,7 @@ inside the context manager [`~PLBartTokenizer.as_target_tokenizer`] to encode ta
 >>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", src_lang="en_XX", tgt_lang="python")
 >>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
 >>> expected_translation_english = "Returns the maximum value of a b c."
->>> inputs = tokenizer(example_python_phrase, return_tensors="pt")
->>> with tokenizer.as_target_tokenizer():
-...     labels = tokenizer(expected_translation_english, return_tensors="pt")
->>> inputs["labels"] = labels["input_ids"]
->>> # forward pass
+>>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
 >>> model(**inputs)
 ```
 
@@ -88,7 +85,6 @@ inside the context manager [`~PLBartTokenizer.as_target_tokenizer`] to encode ta
 ## PLBartTokenizer
 
 [[autodoc]] PLBartTokenizer
-    - as_target_tokenizer
     - build_inputs_with_special_tokens
 
 ## PLBartModel
diff --git a/docs/source/en/model_doc/speech-encoder-decoder.mdx b/docs/source/en/model_doc/speech-encoder-decoder.mdx
index 67028f790d..b0718a27a8 100644
--- a/docs/source/en/model_doc/speech-encoder-decoder.mdx
+++ b/docs/source/en/model_doc/speech-encoder-decoder.mdx
@@ -107,7 +107,7 @@ speech inputs) and `labels` (which are the `input_ids` of the encoded target seq
 >>> labels = tokenizer(ds[0]["text"], return_tensors="pt").input_ids
 
 >>> # the forward function automatically creates the correct decoder_input_ids
->>> loss = model(input_values, labels=labels).loss
+>>> loss = model(**input_features).loss
 >>> loss.backward()
 ```
 
diff --git a/docs/source/en/model_doc/speech_to_text.mdx b/docs/source/en/model_doc/speech_to_text.mdx
index e11d95442d..9d855fceb4 100644
--- a/docs/source/en/model_doc/speech_to_text.mdx
+++ b/docs/source/en/model_doc/speech_to_text.mdx
@@ -120,7 +120,6 @@ See the [model hub](https://huggingface.co/models?filter=speech_to_text) to look
     - save_pretrained
     - batch_decode
     - decode
-    - as_target_processor
 
 ## Speech2TextModel
 
diff --git a/docs/source/en/model_doc/speech_to_text_2.mdx b/docs/source/en/model_doc/speech_to_text_2.mdx
index 72754b67aa..ce9e29c32e 100644
--- a/docs/source/en/model_doc/speech_to_text_2.mdx
+++ b/docs/source/en/model_doc/speech_to_text_2.mdx
@@ -114,7 +114,6 @@ See [model hub](https://huggingface.co/models?filter=speech2text2) to look for S
     - save_pretrained
     - batch_decode
     - decode
-    - as_target_processor
 
 ## Speech2Text2ForCausalLM
 
diff --git a/docs/source/en/model_doc/trocr.mdx b/docs/source/en/model_doc/trocr.mdx
index 08de107e43..37dc6f5455 100644
--- a/docs/source/en/model_doc/trocr.mdx
+++ b/docs/source/en/model_doc/trocr.mdx
@@ -94,7 +94,6 @@ See the [model hub](https://huggingface.co/models?filter=trocr) to look for TrOC
     - save_pretrained
     - batch_decode
     - decode
-    - as_target_processor
 
 ## TrOCRForCausalLM
 
diff --git a/docs/source/en/model_doc/wav2vec2.mdx b/docs/source/en/model_doc/wav2vec2.mdx
index 9b2f13ea45..eaca36be46 100644
--- a/docs/source/en/model_doc/wav2vec2.mdx
+++ b/docs/source/en/model_doc/wav2vec2.mdx
@@ -62,7 +62,6 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv
     - save_pretrained
     - batch_decode
     - decode
-    - as_target_processor
 
 ## Wav2Vec2ProcessorWithLM
 
@@ -73,7 +72,6 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv
     - save_pretrained
     - batch_decode
     - decode
-    - as_target_processor
 
 ## Wav2Vec2 specific outputs
 
diff --git a/docs/source/en/preprocessing.mdx b/docs/source/en/preprocessing.mdx
index f9bdae3603..e67741633a 100644
--- a/docs/source/en/preprocessing.mdx
+++ b/docs/source/en/preprocessing.mdx
@@ -486,10 +486,8 @@ A processor combines a feature extractor and tokenizer. Load a processor with [`
 >>> def prepare_dataset(example):
 ...     audio = example["audio"]
 
-...     example["input_values"] = processor(audio["array"], sampling_rate=16000)
+...     example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000))
 
-...     with processor.as_target_processor():
-...         example["labels"] = processor(example["text"]).input_ids
 ...     return example
 ```
 
diff --git a/docs/source/en/tasks/asr.mdx b/docs/source/en/tasks/asr.mdx
index 8ceea824f4..daa627aaf1 100644
--- a/docs/source/en/tasks/asr.mdx
+++ b/docs/source/en/tasks/asr.mdx
@@ -109,11 +109,10 @@ The preprocessing function needs to:
 >>> def prepare_dataset(batch):
 ...     audio = batch["audio"]
 
-...     batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
+...     batch = processor(audio=audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
 ...     batch["input_length"] = len(batch["input_values"])
 
-...     with processor.as_target_processor():
-...         batch["labels"] = processor(batch["transcription"]).input_ids
+...     batch["labels"] = processor(text=batch["transcription"]).input_ids
 ...     return batch
 ```
 
@@ -146,17 +145,9 @@ Unlike other data collators, this specific data collator needs to apply a differ
 ...         input_features = [{"input_values": feature["input_values"]} for feature in features]
 ...         label_features = [{"input_ids": feature["labels"]} for feature in features]
 
-...         batch = self.processor.pad(
-...             input_features,
-...             padding=self.padding,
-...             return_tensors="pt",
-...         )
-...         with self.processor.as_target_processor():
-...             labels_batch = self.processor.pad(
-...                 label_features,
-...                 padding=self.padding,
-...                 return_tensors="pt",
-...             )
+...         batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")
+
+...         labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")
 
 ...         # replace padding with -100 to ignore loss correctly
 ...         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
diff --git a/docs/source/en/tasks/summarization.mdx b/docs/source/en/tasks/summarization.mdx
index 1c73c7396e..f636141a15 100644
--- a/docs/source/en/tasks/summarization.mdx
+++ b/docs/source/en/tasks/summarization.mdx
@@ -67,7 +67,7 @@ Load the T5 tokenizer to process `text` and `summary`:
 The preprocessing function needs to:
 
 1. Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks.
-2. Use a context manager with the `as_target_tokenizer()` function to parallelize tokenization of inputs and labels.
+2. Use the keyword `text_target` argument when tokenizing labels.
 3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter.
 
 ```py
@@ -78,8 +78,7 @@ The preprocessing function needs to:
 ...     inputs = [prefix + doc for doc in examples["text"]]
 ...     model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
 
-...     with tokenizer.as_target_tokenizer():
-...         labels = tokenizer(examples["summary"], max_length=128, truncation=True)
+...     labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
 
 ...     model_inputs["labels"] = labels["input_ids"]
 ...     return model_inputs
diff --git a/docs/source/en/tasks/translation.mdx b/docs/source/en/tasks/translation.mdx
index 4f628b06db..d17b870414 100644
--- a/docs/source/en/tasks/translation.mdx
+++ b/docs/source/en/tasks/translation.mdx
@@ -78,12 +78,7 @@ The preprocessing function needs to:
 >>> def preprocess_function(examples):
 ...     inputs = [prefix + example[source_lang] for example in examples["translation"]]
 ...     targets = [example[target_lang] for example in examples["translation"]]
-...     model_inputs = tokenizer(inputs, max_length=128, truncation=True)
-
-...     with tokenizer.as_target_tokenizer():
-...         labels = tokenizer(targets, max_length=128, truncation=True)
-
-...     model_inputs["labels"] = labels["input_ids"]
+...     model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
 ...     return model_inputs
 ```
 
diff --git a/docs/source/es/preprocessing.mdx b/docs/source/es/preprocessing.mdx
index 3e749ca2cd..9608bf58d9 100644
--- a/docs/source/es/preprocessing.mdx
+++ b/docs/source/es/preprocessing.mdx
@@ -471,10 +471,8 @@ Un processor combina un extractor de características y un tokenizador. Cargue u
 >>> def prepare_dataset(example):
 ...     audio = example["audio"]
 
-...     example["input_values"] = processor(audio["array"], sampling_rate=16000)
+...     example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000))
 
-...     with processor.as_target_processor():
-...         example["labels"] = processor(example["text"]).input_ids
 ...     return example
 ```
 
diff --git a/docs/source/it/preprocessing.mdx b/docs/source/it/preprocessing.mdx
index 5a245fe843..a57ff9df91 100644
--- a/docs/source/it/preprocessing.mdx
+++ b/docs/source/it/preprocessing.mdx
@@ -471,10 +471,8 @@ Un processor combina un estrattore di caratteristiche e un tokenizer. Carica un
 >>> def prepare_dataset(example):
 ...     audio = example["audio"]
 
-...     example["input_values"] = processor(audio["array"], sampling_rate=16000)
+...     example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000))
 
-...     with processor.as_target_processor():
-...         example["labels"] = processor(example["text"]).input_ids
 ...     return example
 ```
 
diff --git a/examples/flax/image-captioning/run_image_captioning_flax.py b/examples/flax/image-captioning/run_image_captioning_flax.py
index 4fe144db8b..4552defb8e 100644
--- a/examples/flax/image-captioning/run_image_captioning_flax.py
+++ b/examples/flax/image-captioning/run_image_captioning_flax.py
@@ -552,11 +552,14 @@ def main():
         targets = captions
 
         model_inputs = {}
-        # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(
-                targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np"
-            )
+
+        labels = tokenizer(
+            text_target=targets,
+            max_length=max_target_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="np",
+        )
         model_inputs["labels"] = labels["input_ids"]
         decoder_input_ids = shift_tokens_right_fn(
             labels["input_ids"], model.config.pad_token_id, model.config.decoder_start_token_id
diff --git a/examples/flax/summarization/run_summarization_flax.py b/examples/flax/summarization/run_summarization_flax.py
index d6f8ec78ba..8f3db8daa0 100644
--- a/examples/flax/summarization/run_summarization_flax.py
+++ b/examples/flax/summarization/run_summarization_flax.py
@@ -590,10 +590,13 @@ def main():
         )
 
         # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(
-                targets, max_length=max_target_length, padding="max_length", truncation=True, return_tensors="np"
-            )
+        labels = tokenizer(
+            text_target=targets,
+            max_length=max_target_length,
+            padding="max_length",
+            truncation=True,
+            return_tensors="np",
+        )
 
         model_inputs["labels"] = labels["input_ids"]
         decoder_input_ids = shift_tokens_right_fn(
diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py
index f0178cfaf1..b460f8d9d5 100644
--- a/examples/pytorch/question-answering/run_seq2seq_qa.py
+++ b/examples/pytorch/question-answering/run_seq2seq_qa.py
@@ -453,9 +453,8 @@ def main():
         inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column)
 
         model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True)
-        # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True)
+        # Tokenize targets with text_target=...
+        labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True)
 
         # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
         # padding in the loss.
@@ -479,9 +478,8 @@ def main():
             return_overflowing_tokens=True,
             return_offsets_mapping=True,
         )
-        # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True)
+        # Tokenize targets with the `text_target` keyword argument
+        labels = tokenizer(text_target=targets, max_length=max_answer_length, padding=padding, truncation=True)
 
         # Since one example might give us several features if it has a long context, we need a map from a feature to
         # its corresponding example. This key gives us just that.
diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
index b2890dd981..720dfd1ea0 100755
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -305,13 +305,12 @@ class DataCollatorCTCWithPadding:
             return_tensors="pt",
         )
 
-        with self.processor.as_target_processor():
-            labels_batch = self.processor.pad(
-                label_features,
-                padding=self.padding,
-                pad_to_multiple_of=self.pad_to_multiple_of_labels,
-                return_tensors="pt",
-            )
+        labels_batch = self.processor.pad(
+            labels=label_features,
+            padding=self.padding,
+            pad_to_multiple_of=self.pad_to_multiple_of_labels,
+            return_tensors="pt",
+        )
 
         # replace padding with -100 to ignore loss correctly
         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py
index 00880c5937..031ac25fa2 100755
--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -522,9 +522,8 @@ def main():
         inputs = [prefix + inp for inp in inputs]
         model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
 
-        # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
+        # Tokenize targets with the `text_target` keyword argument
+        labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
 
         # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
         # padding in the loss.
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index c389257430..16f24cbdab 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -470,9 +470,8 @@ def main():
         inputs = [prefix + inp for inp in inputs]
         model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True)
 
-        # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
+        # Tokenize targets with the `text_target` keyword argument
+        labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
 
         # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
         # padding in the loss.
diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py
index 524121820a..0f21eb2733 100755
--- a/examples/pytorch/translation/run_translation.py
+++ b/examples/pytorch/translation/run_translation.py
@@ -443,9 +443,8 @@ def main():
         inputs = [prefix + inp for inp in inputs]
         model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
 
-        # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
+        # Tokenize targets with the `text_target` keyword argument
+        labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
 
         # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
         # padding in the loss.
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index 7227681d05..6db6e11c50 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -452,9 +452,8 @@ def main():
         inputs = [prefix + inp for inp in inputs]
         model_inputs = tokenizer(inputs, max_length=args.max_source_length, padding=padding, truncation=True)
 
-        # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
+        # Tokenize targets with the `text_target` keyword argument
+        labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
 
         # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
         # padding in the loss.
diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
index 521036c78e..afa3397eb4 100755
--- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
+++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_bnb.py
@@ -304,13 +304,12 @@ class DataCollatorCTCWithPadding:
             return_tensors="pt",
         )
 
-        with self.processor.as_target_processor():
-            labels_batch = self.processor.pad(
-                label_features,
-                padding=self.padding,
-                pad_to_multiple_of=self.pad_to_multiple_of_labels,
-                return_tensors="pt",
-            )
+        labels_batch = self.processor.pad(
+            labels=label_features,
+            padding=self.padding,
+            pad_to_multiple_of=self.pad_to_multiple_of_labels,
+            return_tensors="pt",
+        )
 
         # replace padding with -100 to ignore loss correctly
         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
diff --git a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py
index d357bc4696..57f54048a5 100644
--- a/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py
+++ b/examples/research_projects/robust-speech-event/run_speech_recognition_ctc_streaming.py
@@ -301,13 +301,12 @@ class DataCollatorCTCWithPadding:
             return_tensors="pt",
         )
 
-        with self.processor.as_target_processor():
-            labels_batch = self.processor.pad(
-                label_features,
-                padding=self.padding,
-                pad_to_multiple_of=self.pad_to_multiple_of_labels,
-                return_tensors="pt",
-            )
+        labels_batch = self.processor.pad(
+            labels=label_features,
+            padding=self.padding,
+            pad_to_multiple_of=self.pad_to_multiple_of_labels,
+            return_tensors="pt",
+        )
 
         # replace padding with -100 to ignore loss correctly
         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
diff --git a/examples/research_projects/tapex/run_wikisql_with_tapex.py b/examples/research_projects/tapex/run_wikisql_with_tapex.py
index 461bfbec9a..7573893629 100644
--- a/examples/research_projects/tapex/run_wikisql_with_tapex.py
+++ b/examples/research_projects/tapex/run_wikisql_with_tapex.py
@@ -437,13 +437,12 @@ def main():
                 table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True
             )
 
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(
-                answer=[", ".join(answer) for answer in answers],
-                max_length=max_target_length,
-                padding=padding,
-                truncation=True,
-            )
+        labels = tokenizer(
+            answer=[", ".join(answer) for answer in answers],
+            max_length=max_target_length,
+            padding=padding,
+            truncation=True,
+        )
 
         # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
         # padding in the loss.
diff --git a/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py
index 1750adc546..7ffa8f5f91 100644
--- a/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py
+++ b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py
@@ -413,13 +413,12 @@ def main():
                 table=tables, query=questions, max_length=data_args.max_source_length, padding=padding, truncation=True
             )
 
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(
-                answer=[", ".join(answer) for answer in answers],
-                max_length=max_target_length,
-                padding=padding,
-                truncation=True,
-            )
+        labels = tokenizer(
+            answer=[", ".join(answer) for answer in answers],
+            max_length=max_target_length,
+            padding=padding,
+            truncation=True,
+        )
 
         # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
         # padding in the loss.
diff --git a/examples/research_projects/wav2vec2/run_asr.py b/examples/research_projects/wav2vec2/run_asr.py
index bb34e0a0c7..ab9db11d2a 100755
--- a/examples/research_projects/wav2vec2/run_asr.py
+++ b/examples/research_projects/wav2vec2/run_asr.py
@@ -266,14 +266,13 @@ class DataCollatorCTCWithPadding:
             pad_to_multiple_of=self.pad_to_multiple_of,
             return_tensors="pt",
         )
-        with self.processor.as_target_processor():
-            labels_batch = self.processor.pad(
-                label_features,
-                padding=self.padding,
-                max_length=self.max_length_labels,
-                pad_to_multiple_of=self.pad_to_multiple_of_labels,
-                return_tensors="pt",
-            )
+        labels_batch = self.processor.pad(
+            labels=label_features,
+            padding=self.padding,
+            max_length=self.max_length_labels,
+            pad_to_multiple_of=self.pad_to_multiple_of_labels,
+            return_tensors="pt",
+        )
 
         # replace padding with -100 to ignore loss correctly
         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
@@ -419,9 +418,10 @@ def main():
             len(set(batch["sampling_rate"])) == 1
         ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
 
-        batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
-        with processor.as_target_processor():
-            batch["labels"] = processor(batch[data_args.target_text_column]).input_ids
+        processed_batch = processor(
+            audio=batch["speech"], text=batch[data_args.target_text_column], sampling_rate=batch["sampling_rate"][0]
+        )
+        batch.update(processed_batch)
         return batch
 
     train_dataset = train_dataset.map(
diff --git a/examples/research_projects/wav2vec2/run_common_voice.py b/examples/research_projects/wav2vec2/run_common_voice.py
index b8480d3c7d..10a3a77fa7 100644
--- a/examples/research_projects/wav2vec2/run_common_voice.py
+++ b/examples/research_projects/wav2vec2/run_common_voice.py
@@ -185,14 +185,13 @@ class DataCollatorCTCWithPadding:
             pad_to_multiple_of=self.pad_to_multiple_of,
             return_tensors="pt",
         )
-        with self.processor.as_target_processor():
-            labels_batch = self.processor.pad(
-                label_features,
-                padding=self.padding,
-                max_length=self.max_length_labels,
-                pad_to_multiple_of=self.pad_to_multiple_of_labels,
-                return_tensors="pt",
-            )
+        labels_batch = self.processor.pad(
+            labels=label_features,
+            padding=self.padding,
+            max_length=self.max_length_labels,
+            pad_to_multiple_of=self.pad_to_multiple_of_labels,
+            return_tensors="pt",
+        )
 
         # replace padding with -100 to ignore loss correctly
         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
@@ -414,10 +413,11 @@ def main():
         assert (
             len(set(batch["sampling_rate"])) == 1
         ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
-        batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
-        # Setup the processor for targets
-        with processor.as_target_processor():
-            batch["labels"] = processor(batch["target_text"]).input_ids
+
+        processed_batch = processor(
+            audio=batch["speech"], text=batch["target_text"], sampling_rate=batch["sampling_rate"][0]
+        )
+        batch.update(processed_batch)
         return batch
 
     train_dataset = train_dataset.map(
diff --git a/examples/research_projects/xtreme-s/run_xtreme_s.py b/examples/research_projects/xtreme-s/run_xtreme_s.py
index 972c6d5462..d3e4f5cb38 100644
--- a/examples/research_projects/xtreme-s/run_xtreme_s.py
+++ b/examples/research_projects/xtreme-s/run_xtreme_s.py
@@ -349,13 +349,12 @@ class SpeechDataCollatorWithPadding:
 
         if self.pad_labels:
             label_features = [{"input_ids": feature["labels"]} for feature in features]
-            with self.processor.as_target_processor():
-                labels_batch = self.processor.pad(
-                    label_features,
-                    padding=self.padding,
-                    pad_to_multiple_of=self.pad_to_multiple_of_labels,
-                    return_tensors="pt",
-                )
+            labels_batch = self.processor.pad(
+                labels=label_features,
+                padding=self.padding,
+                pad_to_multiple_of=self.pad_to_multiple_of_labels,
+                return_tensors="pt",
+            )
 
             # replace padding with -100 to ignore loss correctly
             labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py
index a265b4f20f..350995e50d 100644
--- a/examples/tensorflow/summarization/run_summarization.py
+++ b/examples/tensorflow/summarization/run_summarization.py
@@ -504,9 +504,8 @@ def main():
         inputs = [prefix + inp for inp in inputs]
         model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
 
-        # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
+        # Tokenize targets with the `text_target` keyword argument
+        labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
 
         # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
         # padding in the loss.
diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py
index 5db26d20ec..6e12288fd4 100644
--- a/examples/tensorflow/translation/run_translation.py
+++ b/examples/tensorflow/translation/run_translation.py
@@ -458,9 +458,8 @@ def main():
         inputs = [prefix + inp for inp in inputs]
         model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
 
-        # Setup the tokenizer for targets
-        with tokenizer.as_target_tokenizer():
-            labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
+        # Tokenize targets with the `text_target` keyword argument
+        labels = tokenizer(text_target=targets, max_length=max_target_length, padding=padding, truncation=True)
 
         # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
         # padding in the loss.
diff --git a/src/transformers/models/hubert/modeling_tf_hubert.py b/src/transformers/models/hubert/modeling_tf_hubert.py
index fc6e5b13d4..f078b5d0cf 100644
--- a/src/transformers/models/hubert/modeling_tf_hubert.py
+++ b/src/transformers/models/hubert/modeling_tf_hubert.py
@@ -1612,9 +1612,8 @@ class TFHubertForCTC(TFHubertPreTrainedModel):
         >>> # compute loss
         >>> target_transcription = "A MAN SAID TO THE UNIVERSE SIR I EXIST"
 
-        >>> # wrap processor as target processor to encode labels
-        >>> with processor.as_target_processor():
-        ...     labels = processor(transcription, return_tensors="tf").input_values
+        >>> # Pass the transcription as text to encode labels
+        >>> labels = processor(text=transcription, return_tensors="tf").input_values
 
         >>> loss = model(input_values, labels=labels).loss
         ```"""
diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py
index f2e9c855bf..b67b82fb7a 100644
--- a/src/transformers/models/m2m_100/tokenization_m2m_100.py
+++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py
@@ -14,7 +14,6 @@
 """Tokenization classes for M2M100."""
 import json
 import os
-from contextlib import contextmanager
 from pathlib import Path
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -116,10 +115,8 @@ class M2M100Tokenizer(PreTrainedTokenizer):
     >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ro")
     >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
     >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
-    >>> model_inputs = tokenizer(src_text, return_tensors="pt")
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(tgt_text, return_tensors="pt").input_ids
-    >>> model(**model_inputs, labels=labels)  # should work
+    >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
+    >>> model(**model_inputs)  # should work
     ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -346,16 +343,12 @@ class M2M100Tokenizer(PreTrainedTokenizer):
         inputs["forced_bos_token_id"] = tgt_lang_id
         return inputs
 
-    @contextmanager
-    def as_target_tokenizer(self):
-        """
-        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
-        sequence-to-sequence models that need a slightly different processing for the labels.
-        """
-        self.set_tgt_lang_special_tokens(self.tgt_lang)
-        yield
+    def _switch_to_input_mode(self):
         self.set_src_lang_special_tokens(self.src_lang)
 
+    def _switch_to_target_mode(self):
+        self.set_tgt_lang_special_tokens(self.tgt_lang)
+
     def set_src_lang_special_tokens(self, src_lang: str) -> None:
         """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
         lang_token = self.get_lang_token(src_lang)
diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py
index 62f145e7b7..66eb5a44c5 100644
--- a/src/transformers/models/marian/tokenization_marian.py
+++ b/src/transformers/models/marian/tokenization_marian.py
@@ -15,7 +15,6 @@ import json
 import os
 import re
 import warnings
-from contextlib import contextmanager
 from pathlib import Path
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -112,10 +111,7 @@ class MarianTokenizer(PreTrainedTokenizer):
     >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
     >>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
     >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
-    >>> inputs = tokenizer(src_texts, return_tensors="pt", padding=True)
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(tgt_texts, return_tensors="pt", padding=True)
-    >>> inputs["labels"] = labels["input_ids"]
+    >>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)
     # keys  [input_ids, attention_mask, labels].
 
     >>> outputs = model(**inputs)  # should work
@@ -281,18 +277,14 @@ class MarianTokenizer(PreTrainedTokenizer):
         # We don't expect to process pairs, but leave the pair logic for API consistency
         return token_ids_0 + token_ids_1 + [self.eos_token_id]
 
-    @contextmanager
-    def as_target_tokenizer(self):
-        """
-        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
-        sequence-to-sequence models that need a slightly different processing for the labels.
-        """
+    def _switch_to_input_mode(self):
+        self.current_spm = self.spm_source
+        self.current_encoder = self.encoder
+
+    def _switch_to_target_mode(self):
         self.current_spm = self.spm_target
         if self.separate_vocabs:
             self.current_encoder = self.target_encoder
-        yield
-        self.current_spm = self.spm_source
-        self.current_encoder = self.encoder
 
     @property
     def vocab_size(self) -> int:
diff --git a/src/transformers/models/mbart/tokenization_mbart.py b/src/transformers/models/mbart/tokenization_mbart.py
index 2517dfb584..6546074642 100644
--- a/src/transformers/models/mbart/tokenization_mbart.py
+++ b/src/transformers/models/mbart/tokenization_mbart.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import os
-from contextlib import contextmanager
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -69,10 +68,7 @@ class MBartTokenizer(PreTrainedTokenizer):
     >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
     >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
     >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")
-    >>> inputs["labels"] = labels["input_ids"]
+    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
     ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -340,15 +336,11 @@ class MBartTokenizer(PreTrainedTokenizer):
         self.tgt_lang = tgt_lang
         return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
 
-    @contextmanager
-    def as_target_tokenizer(self):
-        """
-        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
-        sequence-to-sequence models that need a slightly different processing for the labels.
-        """
-        self.set_tgt_lang_special_tokens(self.tgt_lang)
-        yield
-        self.set_src_lang_special_tokens(self.src_lang)
+    def _switch_to_input_mode(self):
+        return self.set_src_lang_special_tokens(self.src_lang)
+
+    def _switch_to_target_mode(self):
+        return self.set_tgt_lang_special_tokens(self.tgt_lang)
 
     def set_src_lang_special_tokens(self, src_lang) -> None:
         """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
diff --git a/src/transformers/models/mbart/tokenization_mbart_fast.py b/src/transformers/models/mbart/tokenization_mbart_fast.py
index 52902e3a40..8bf75ebe59 100644
--- a/src/transformers/models/mbart/tokenization_mbart_fast.py
+++ b/src/transformers/models/mbart/tokenization_mbart_fast.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import os
-from contextlib import contextmanager
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
@@ -82,10 +81,7 @@ class MBartTokenizerFast(PreTrainedTokenizerFast):
     ... )
     >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
     >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")
-    >>> inputs["labels"] = labels["input_ids"]
+    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
     ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -240,15 +236,11 @@ class MBartTokenizerFast(PreTrainedTokenizerFast):
         self.tgt_lang = tgt_lang
         return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
 
-    @contextmanager
-    def as_target_tokenizer(self):
-        """
-        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
-        sequence-to-sequence models that need a slightly different processing for the labels.
-        """
-        self.set_tgt_lang_special_tokens(self.tgt_lang)
-        yield
-        self.set_src_lang_special_tokens(self.src_lang)
+    def _switch_to_input_mode(self):
+        return self.set_src_lang_special_tokens(self.src_lang)
+
+    def _switch_to_target_mode(self):
+        return self.set_tgt_lang_special_tokens(self.tgt_lang)
 
     def set_src_lang_special_tokens(self, src_lang) -> None:
         """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
diff --git a/src/transformers/models/mbart50/tokenization_mbart50.py b/src/transformers/models/mbart50/tokenization_mbart50.py
index 145a546c18..707a977349 100644
--- a/src/transformers/models/mbart50/tokenization_mbart50.py
+++ b/src/transformers/models/mbart50/tokenization_mbart50.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import os
-from contextlib import contextmanager
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -102,10 +101,8 @@ class MBart50Tokenizer(PreTrainedTokenizer):
     >>> tokenizer = MBart50Tokenizer.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
     >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
     >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
-    >>> model_inputs = tokenizer(src_text, return_tensors="pt")
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(tgt_text, return_tensors="pt").input_ids
-    >>> # model(**model_inputs, labels=labels) should work
+    >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
+    >>> # model(**model_inputs) should work
     ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -337,15 +334,11 @@ class MBart50Tokenizer(PreTrainedTokenizer):
         self.tgt_lang = tgt_lang
         return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
 
-    @contextmanager
-    def as_target_tokenizer(self):
-        """
-        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
-        sequence-to-sequence models that need a slightly different processing for the labels.
-        """
-        self.set_tgt_lang_special_tokens(self.tgt_lang)
-        yield
-        self.set_src_lang_special_tokens(self.src_lang)
+    def _switch_to_input_mode(self):
+        return self.set_src_lang_special_tokens(self.src_lang)
+
+    def _switch_to_target_mode(self):
+        return self.set_tgt_lang_special_tokens(self.tgt_lang)
 
     def set_src_lang_special_tokens(self, src_lang: str) -> None:
         """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos]."""
diff --git a/src/transformers/models/mbart50/tokenization_mbart50_fast.py b/src/transformers/models/mbart50/tokenization_mbart50_fast.py
index 28fb726c47..1ab8ff06e2 100644
--- a/src/transformers/models/mbart50/tokenization_mbart50_fast.py
+++ b/src/transformers/models/mbart50/tokenization_mbart50_fast.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import os
-from contextlib import contextmanager
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
@@ -98,10 +97,8 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
     >>> tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
     >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
     >>> tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
-    >>> model_inputs = tokenizer(src_text, return_tensors="pt")
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(tgt_text, return_tensors="pt").input_ids
-    >>> # model(**model_inputs, labels=labels) should work
+    >>> model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
+    >>> # model(**model_inputs) should work
     ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -211,15 +208,11 @@ class MBart50TokenizerFast(PreTrainedTokenizerFast):
         self.tgt_lang = tgt_lang
         return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
 
-    @contextmanager
-    def as_target_tokenizer(self):
-        """
-        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
-        sequence-to-sequence models that need a slightly different processing for the labels.
-        """
-        self.set_tgt_lang_special_tokens(self.tgt_lang)
-        yield
-        self.set_src_lang_special_tokens(self.src_lang)
+    def _switch_to_input_mode(self):
+        return self.set_src_lang_special_tokens(self.src_lang)
+
+    def _switch_to_target_mode(self):
+        return self.set_tgt_lang_special_tokens(self.tgt_lang)
 
     def set_src_lang_special_tokens(self, src_lang: str) -> None:
         """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos]."""
diff --git a/src/transformers/models/mctct/processing_mctct.py b/src/transformers/models/mctct/processing_mctct.py
index 0892f34592..2e05020196 100644
--- a/src/transformers/models/mctct/processing_mctct.py
+++ b/src/transformers/models/mctct/processing_mctct.py
@@ -15,6 +15,7 @@
 """
 Speech processor class for M-CTC-T
 """
+import warnings
 from contextlib import contextmanager
 
 from ...processing_utils import ProcessorMixin
@@ -39,6 +40,7 @@ class MCTCTProcessor(ProcessorMixin):
     def __init__(self, feature_extractor, tokenizer):
         super().__init__(feature_extractor, tokenizer)
         self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
 
     def __call__(self, *args, **kwargs):
         """
@@ -47,7 +49,35 @@ class MCTCTProcessor(ProcessorMixin):
         [`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to AutoTokenizer's
         [`~AutoTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
         """
-        return self.current_processor(*args, **kwargs)
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor(*args, **kwargs)
+
+        if "raw_speech" in kwargs:
+            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
+            audio = kwargs.pop("raw_speech")
+        else:
+            audio = kwargs.pop("audio", None)
+        text = kwargs.pop("text", None)
+        if len(args) > 0:
+            audio = args[0]
+            args = args[1:]
+
+        if audio is None and text is None:
+            raise ValueError("You need to specify either an `audio` or `text` input to process.")
+
+        if audio is not None:
+            inputs = self.feature_extractor(audio, *args, **kwargs)
+        if text is not None:
+            encodings = self.tokenizer(text, **kwargs)
+
+        if text is None:
+            return inputs
+        elif audio is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
 
     def batch_decode(self, *args, **kwargs):
         """
@@ -63,7 +93,28 @@ class MCTCTProcessor(ProcessorMixin):
         [`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
         [`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information.
         """
-        return self.current_processor.pad(*args, **kwargs)
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor.pad(*args, **kwargs)
+
+        input_features = kwargs.pop("input_features", None)
+        labels = kwargs.pop("labels", None)
+        if len(args) > 0:
+            input_features = args[0]
+            args = args[1:]
+
+        if input_features is not None:
+            input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
+        if labels is not None:
+            labels = self.tokenizer.pad(labels, **kwargs)
+
+        if labels is None:
+            return input_features
+        elif input_features is None:
+            return labels
+        else:
+            input_features["labels"] = labels["input_ids"]
+            return input_features
 
     def decode(self, *args, **kwargs):
         """
@@ -77,6 +128,13 @@ class MCTCTProcessor(ProcessorMixin):
         """
         Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning MCTCT.
         """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your audio inputs, or in a separate call."
+        )
+        self._in_target_context_manager = True
         self.current_processor = self.tokenizer
         yield
         self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
diff --git a/src/transformers/models/mt5/modeling_flax_mt5.py b/src/transformers/models/mt5/modeling_flax_mt5.py
index 841d2069e6..4f2fa5b9fb 100644
--- a/src/transformers/models/mt5/modeling_flax_mt5.py
+++ b/src/transformers/models/mt5/modeling_flax_mt5.py
@@ -57,8 +57,7 @@ class FlaxMT5Model(FlaxT5Model):
     >>> summary = "Weiter Verhandlung in Syrien."
     >>> inputs = tokenizer(article, return_tensors="np")
 
-    >>> with tokenizer.as_target_tokenizer():
-    ...     decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids
+    >>> decoder_input_ids = tokenizer(text_target=summary, return_tensors="np").input_ids
 
     >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=decoder_input_ids)
     >>> hidden_states = outputs.last_hidden_state
@@ -84,8 +83,7 @@ class FlaxMT5EncoderModel(FlaxT5EncoderModel):
     >>> summary = "Weiter Verhandlung in Syrien."
     >>> inputs = tokenizer(article, return_tensors="np")
 
-    >>> with tokenizer.as_target_tokenizer():
-    ...     decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids
+    >>> decoder_input_ids = tokenizer(text_target=summary, return_tensors="np").input_ids
 
     >>> outputs = model(input_ids=inputs["input_ids"])
     >>> hidden_states = outputs.last_hidden_state
@@ -111,8 +109,7 @@ class FlaxMT5ForConditionalGeneration(FlaxT5ForConditionalGeneration):
     >>> summary = "Weiter Verhandlung in Syrien."
     >>> inputs = tokenizer(article, return_tensors="np")
 
-    >>> with tokenizer.as_target_tokenizer():
-    ...     decoder_input_ids = tokenizer(summary, return_tensors="np").input_ids
+    >>> decoder_input_ids = tokenizer(text_target=summary, return_tensors="np").input_ids
 
     >>> outputs = model(**inputs, decoder_input_ids=decoder_input_ids)
     >>> logits = outputs.logits
diff --git a/src/transformers/models/mt5/modeling_mt5.py b/src/transformers/models/mt5/modeling_mt5.py
index 8c19a63ede..c562b01152 100644
--- a/src/transformers/models/mt5/modeling_mt5.py
+++ b/src/transformers/models/mt5/modeling_mt5.py
@@ -40,8 +40,7 @@ class MT5Model(T5Model):
     >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
     >>> summary = "Weiter Verhandlung in Syrien."
     >>> inputs = tokenizer(article, return_tensors="pt")
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(summary, return_tensors="pt")
+    >>> labels = tokenizer(text_target=summary, return_tensors="pt")
 
     >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
     >>> hidden_states = outputs.last_hidden_state
@@ -73,11 +72,9 @@ class MT5ForConditionalGeneration(T5ForConditionalGeneration):
     >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
     >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
     >>> summary = "Weiter Verhandlung in Syrien."
-    >>> inputs = tokenizer(article, return_tensors="pt")
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(summary, return_tensors="pt")
+    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")
 
-    >>> outputs = model(**inputs, labels=labels["input_ids"])
+    >>> outputs = model(**inputs)
     >>> loss = outputs.loss
     ```"""
 
diff --git a/src/transformers/models/mt5/modeling_tf_mt5.py b/src/transformers/models/mt5/modeling_tf_mt5.py
index 2808b8421a..71aa0bb66a 100644
--- a/src/transformers/models/mt5/modeling_tf_mt5.py
+++ b/src/transformers/models/mt5/modeling_tf_mt5.py
@@ -40,8 +40,7 @@ class TFMT5Model(TFT5Model):
     >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
     >>> summary = "Weiter Verhandlung in Syrien."
     >>> inputs = tokenizer(article, return_tensors="tf")
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(summary, return_tensors="tf")
+    >>> labels = tokenizer(text_target=summary, return_tensors="tf")
 
     >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
     >>> hidden_states = outputs.last_hidden_state
@@ -64,11 +63,9 @@ class TFMT5ForConditionalGeneration(TFT5ForConditionalGeneration):
     >>> tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
     >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
     >>> summary = "Weiter Verhandlung in Syrien."
-    >>> inputs = tokenizer(article, return_tensors="tf")
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(summary, return_tensors="tf")
+    >>> inputs = tokenizer(article, text_target=summary, return_tensors="tf")
 
-    >>> outputs = model(**inputs, labels=labels["input_ids"])
+    >>> outputs = model(**inputs)
     >>> loss = outputs.loss
     ```"""
 
diff --git a/src/transformers/models/nllb/tokenization_nllb.py b/src/transformers/models/nllb/tokenization_nllb.py
index ef0ee942bf..6a326fd3ca 100644
--- a/src/transformers/models/nllb/tokenization_nllb.py
+++ b/src/transformers/models/nllb/tokenization_nllb.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import os
-from contextlib import contextmanager
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -67,10 +66,7 @@ class NllbTokenizer(PreTrainedTokenizer):
     ... )
     >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
     >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
-    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(expected_translation_french, return_tensors="pt")
-    >>> inputs["labels"] = labels["input_ids"]
+    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
     ```
 
     Args:
@@ -386,15 +382,11 @@ class NllbTokenizer(PreTrainedTokenizer):
         self.tgt_lang = tgt_lang
         return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
 
-    @contextmanager
-    def as_target_tokenizer(self):
-        """
-        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
-        sequence-to-sequence models that need a slightly different processing for the labels.
-        """
-        self.set_tgt_lang_special_tokens(self.tgt_lang)
-        yield
-        self.set_src_lang_special_tokens(self.src_lang)
+    def _switch_to_input_mode(self):
+        return self.set_src_lang_special_tokens(self.src_lang)
+
+    def _switch_to_target_mode(self):
+        return self.set_tgt_lang_special_tokens(self.tgt_lang)
 
     def set_src_lang_special_tokens(self, src_lang) -> None:
         """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
diff --git a/src/transformers/models/nllb/tokenization_nllb_fast.py b/src/transformers/models/nllb/tokenization_nllb_fast.py
index fa4eaa4c5a..1afe27f43b 100644
--- a/src/transformers/models/nllb/tokenization_nllb_fast.py
+++ b/src/transformers/models/nllb/tokenization_nllb_fast.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import os
-from contextlib import contextmanager
 from shutil import copyfile
 from typing import List, Optional, Tuple
 
@@ -80,10 +79,7 @@ class NllbTokenizerFast(PreTrainedTokenizerFast):
     ... )
     >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
     >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
-    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(expected_translation_french, return_tensors="pt")
-    >>> inputs["labels"] = labels["input_ids"]
+    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
     ```
 
     Args:
@@ -284,15 +280,11 @@ class NllbTokenizerFast(PreTrainedTokenizerFast):
         self.tgt_lang = tgt_lang
         return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
 
-    @contextmanager
-    def as_target_tokenizer(self):
-        """
-        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
-        sequence-to-sequence models that need a slightly different processing for the labels.
-        """
-        self.set_tgt_lang_special_tokens(self.tgt_lang)
-        yield
-        self.set_src_lang_special_tokens(self.src_lang)
+    def _switch_to_input_mode(self):
+        return self.set_src_lang_special_tokens(self.src_lang)
+
+    def _switch_to_target_mode(self):
+        return self.set_tgt_lang_special_tokens(self.tgt_lang)
 
     def set_src_lang_special_tokens(self, src_lang) -> None:
         """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
diff --git a/src/transformers/models/plbart/tokenization_plbart.py b/src/transformers/models/plbart/tokenization_plbart.py
index 4a3ee1cdcd..411df99692 100644
--- a/src/transformers/models/plbart/tokenization_plbart.py
+++ b/src/transformers/models/plbart/tokenization_plbart.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 
 import os
-from contextlib import contextmanager
 from shutil import copyfile
 from typing import Any, Dict, List, Optional, Tuple
 
@@ -153,10 +152,7 @@ class PLBartTokenizer(PreTrainedTokenizer):
     >>> tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX")
     >>> example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])"
     >>> expected_translation_english = "Returns the maximum value of a b c."
-    >>> inputs = tokenizer(example_python_phrase, return_tensors="pt")
-    >>> with tokenizer.as_target_tokenizer():
-    ...     labels = tokenizer(expected_translation_english, return_tensors="pt")
-    >>> inputs["labels"] = labels["input_ids"]
+    >>> inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt")
     ```"""
 
     vocab_files_names = VOCAB_FILES_NAMES
@@ -441,15 +437,11 @@ class PLBartTokenizer(PreTrainedTokenizer):
         self.tgt_lang = tgt_lang
         return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
 
-    @contextmanager
-    def as_target_tokenizer(self):
-        """
-        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
-        sequence-to-sequence models that need a slightly different processing for the labels.
-        """
-        self.set_tgt_lang_special_tokens(self.tgt_lang)
-        yield
-        self.set_src_lang_special_tokens(self.src_lang)
+    def _switch_to_input_mode(self):
+        return self.set_src_lang_special_tokens(self.src_lang)
+
+    def _switch_to_target_mode(self):
+        return self.set_tgt_lang_special_tokens(self.tgt_lang)
 
     def set_src_lang_special_tokens(self, src_lang) -> None:
         """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
diff --git a/src/transformers/models/rag/modeling_rag.py b/src/transformers/models/rag/modeling_rag.py
index 1d6a62b201..41af393c67 100644
--- a/src/transformers/models/rag/modeling_rag.py
+++ b/src/transformers/models/rag/modeling_rag.py
@@ -818,8 +818,7 @@ class RagSequenceForGeneration(RagPreTrainedModel):
         >>> model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
 
         >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
-        >>> with tokenizer.as_target_tokenizer():
-        ...     targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt")
+        >>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
         >>> input_ids = inputs["input_ids"]
         >>> labels = targets["input_ids"]
         >>> outputs = model(input_ids=input_ids, labels=labels)
@@ -1287,8 +1286,7 @@ class RagTokenForGeneration(RagPreTrainedModel):
         >>> model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)
 
         >>> inputs = tokenizer("How many people live in Paris?", return_tensors="pt")
-        >>> with tokenizer.as_target_tokenizer():
-        ...     targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt")
+        >>> targets = tokenizer(text_target="In Paris, there are 10 million people.", return_tensors="pt")
         >>> input_ids = inputs["input_ids"]
         >>> labels = targets["input_ids"]
         >>> outputs = model(input_ids=input_ids, labels=labels)
diff --git a/src/transformers/models/rag/tokenization_rag.py b/src/transformers/models/rag/tokenization_rag.py
index d92ca1788f..5b6ec67e6b 100644
--- a/src/transformers/models/rag/tokenization_rag.py
+++ b/src/transformers/models/rag/tokenization_rag.py
@@ -15,7 +15,6 @@
 """Tokenization classes for RAG."""
 import os
 import warnings
-from contextlib import contextmanager
 from typing import List, Optional
 
 from ...tokenization_utils_base import BatchEncoding
@@ -68,16 +67,12 @@ class RagTokenizer:
     def decode(self, *args, **kwargs):
         return self.generator.decode(*args, **kwargs)
 
-    @contextmanager
-    def as_target_tokenizer(self):
-        """
-        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
-        sequence-to-sequence models that need a slightly different processing for the labels.
-        """
-        self.current_tokenizer = self.generator
-        yield
+    def _switch_to_input_mode(self):
         self.current_tokenizer = self.question_encoder
 
+    def _switch_to_target_mode(self):
+        self.current_tokenizer = self.generator
+
     def prepare_seq2seq_batch(
         self,
         src_texts: List[str],
@@ -110,17 +105,16 @@ class RagTokenizer:
         if tgt_texts is None:
             return model_inputs
         # Process tgt_texts
-        with self.as_target_tokenizer():
-            if max_target_length is None:
-                max_target_length = self.current_tokenizer.model_max_length
-            labels = self(
-                tgt_texts,
-                add_special_tokens=True,
-                return_tensors=return_tensors,
-                padding=padding,
-                max_length=max_target_length,
-                truncation=truncation,
-                **kwargs,
-            )
+        if max_target_length is None:
+            max_target_length = self.current_tokenizer.model_max_length
+        labels = self(
+            text_target=tgt_texts,
+            add_special_tokens=True,
+            return_tensors=return_tensors,
+            padding=padding,
+            max_length=max_target_length,
+            truncation=truncation,
+            **kwargs,
+        )
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
index bf67c6d544..388be24499 100644
--- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
+++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py
@@ -482,8 +482,7 @@ class SpeechEncoderDecoderModel(PreTrainedModel):
         'Mr. Quilter ist der Apostel der Mittelschicht und wir freuen uns, sein Evangelium willkommen heißen zu können.'
 
         >>> # Training: Train model on English transcription
-        >>> with processor.as_target_processor():
-        ...     labels = processor(ds[0]["text"], return_tensors="pt").input_ids
+        >>> labels = processor(text=ds[0]["text"], return_tensors="pt").input_ids
 
         >>> loss = model(input_values, labels=labels).loss
         >>> loss.backward()
diff --git a/src/transformers/models/speech_to_text/processing_speech_to_text.py b/src/transformers/models/speech_to_text/processing_speech_to_text.py
index 969df9d108..3f04793203 100644
--- a/src/transformers/models/speech_to_text/processing_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/processing_speech_to_text.py
@@ -15,6 +15,7 @@
 """
 Speech processor class for Speech2Text
 """
+import warnings
 from contextlib import contextmanager
 
 from ...processing_utils import ProcessorMixin
@@ -41,6 +42,7 @@ class Speech2TextProcessor(ProcessorMixin):
     def __init__(self, feature_extractor, tokenizer):
         super().__init__(feature_extractor, tokenizer)
         self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
 
     def __call__(self, *args, **kwargs):
         """
@@ -50,7 +52,35 @@ class Speech2TextProcessor(ProcessorMixin):
         [`~Speech2TextTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more
         information.
         """
-        return self.current_processor(*args, **kwargs)
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor(*args, **kwargs)
+
+        if "raw_speech" in kwargs:
+            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
+            audio = kwargs.pop("raw_speech")
+        else:
+            audio = kwargs.pop("audio", None)
+        text = kwargs.pop("text", None)
+        if len(args) > 0:
+            audio = args[0]
+            args = args[1:]
+
+        if audio is None and text is None:
+            raise ValueError("You need to specify either an `audio` or `text` input to process.")
+
+        if audio is not None:
+            inputs = self.feature_extractor(audio, *args, **kwargs)
+        if text is not None:
+            encodings = self.tokenizer(text, **kwargs)
+
+        if text is None:
+            return inputs
+        elif audio is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
 
     def batch_decode(self, *args, **kwargs):
         """
@@ -72,6 +102,13 @@ class Speech2TextProcessor(ProcessorMixin):
         Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
         Speech2Text.
         """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your audio inputs, or in a separate call."
+        )
+        self._in_target_context_manager = True
         self.current_processor = self.tokenizer
         yield
         self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
diff --git a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py b/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py
index 28189ba881..c40831d021 100644
--- a/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py
+++ b/src/transformers/models/speech_to_text_2/processing_speech_to_text_2.py
@@ -15,6 +15,7 @@
 """
 Speech processor class for Speech2Text2
 """
+import warnings
 from contextlib import contextmanager
 
 from ...processing_utils import ProcessorMixin
@@ -40,6 +41,7 @@ class Speech2Text2Processor(ProcessorMixin):
     def __init__(self, feature_extractor, tokenizer):
         super().__init__(feature_extractor, tokenizer)
         self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
 
     def __call__(self, *args, **kwargs):
         """
@@ -49,7 +51,35 @@ class Speech2Text2Processor(ProcessorMixin):
         Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the doctsring of the above two
         methods for more information.
         """
-        return self.current_processor(*args, **kwargs)
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor(*args, **kwargs)
+
+        if "raw_speech" in kwargs:
+            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
+            audio = kwargs.pop("raw_speech")
+        else:
+            audio = kwargs.pop("audio", None)
+        text = kwargs.pop("text", None)
+        if len(args) > 0:
+            audio = args[0]
+            args = args[1:]
+
+        if audio is None and text is None:
+            raise ValueError("You need to specify either an `audio` or `text` input to process.")
+
+        if audio is not None:
+            inputs = self.feature_extractor(audio, *args, **kwargs)
+        if text is not None:
+            encodings = self.tokenizer(text, **kwargs)
+
+        if text is None:
+            return inputs
+        elif audio is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
 
     def batch_decode(self, *args, **kwargs):
         """
@@ -71,6 +101,13 @@ class Speech2Text2Processor(ProcessorMixin):
         Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
         Speech2Text2.
         """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your audio inputs, or in a separate call."
+        )
+        self._in_target_context_manager = True
         self.current_processor = self.tokenizer
         yield
         self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
diff --git a/src/transformers/models/tapex/tokenization_tapex.py b/src/transformers/models/tapex/tokenization_tapex.py
index ea1dc0dcc4..7c0725ffe7 100644
--- a/src/transformers/models/tapex/tokenization_tapex.py
+++ b/src/transformers/models/tapex/tokenization_tapex.py
@@ -17,7 +17,6 @@
 import json
 import os
 import random
-from contextlib import contextmanager
 from functools import lru_cache
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -63,12 +62,6 @@ class TapexTruncationStrategy(ExplicitEnum):
     DROP_ROWS_TO_FIT = "drop_rows_to_fit"
 
 
-class TokenizerStrategy(ExplicitEnum):
-
-    TOKENIZE_SOURCE = "tokenize_source"
-    TOKENIZE_TARGET = "tokenize_target"
-
-
 TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
             add_special_tokens (`bool`, *optional*, defaults to `True`):
                 Whether or not to encode the sequences with the special tokens relative to their model.
@@ -341,9 +334,6 @@ class TapexTokenizer(PreTrainedTokenizer):
         self.max_cell_length = max_cell_length
         self.table_linearize = IndexedRowTableLinearize()
 
-        # property to decide using which call function
-        self.current_tokenizer = TokenizerStrategy.TOKENIZE_SOURCE
-
     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
     ) -> List[int]:
@@ -555,9 +545,7 @@ class TapexTokenizer(PreTrainedTokenizer):
                 Optionally, the corresponding answer to the questions as supervision.
         """
 
-        if self.current_tokenizer == TokenizerStrategy.TOKENIZE_SOURCE:
-            if table is None:
-                raise ValueError("Please ensure that the table is not empty if you use TAPEX to encode source.")
+        if table is not None:
             return self.source_call_func(
                 table=table,
                 query=query,
@@ -578,9 +566,7 @@ class TapexTokenizer(PreTrainedTokenizer):
                 verbose=verbose,
                 **kwargs,
             )
-        else:
-            if answer is None:
-                raise ValueError("Please ensure that the answer is not empty if you use TAPEX to encode target.")
+        elif answer is not None:
             return self.target_call_func(
                 answer=answer,
                 add_special_tokens=add_special_tokens,
@@ -599,6 +585,8 @@ class TapexTokenizer(PreTrainedTokenizer):
                 verbose=verbose,
                 **kwargs,
             )
+        else:
+            raise ValueError("You need to provide either a `table` or an `answer`.")
 
     def source_call_func(
         self,
@@ -1330,17 +1318,6 @@ class TapexTokenizer(PreTrainedTokenizer):
             verbose=verbose,
         )
 
-    @contextmanager
-    def as_target_tokenizer(self):
-        """
-        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
-        sequence-to-sequence models that need a slightly different processing for the labels.
-        """
-        self.current_tokenizer = TokenizerStrategy.TOKENIZE_TARGET
-        yield
-        # restore the call function
-        self.current_tokenizer = TokenizerStrategy.TOKENIZE_SOURCE
-
     def prepare_table_query(
         self,
         table,
diff --git a/src/transformers/models/trocr/processing_trocr.py b/src/transformers/models/trocr/processing_trocr.py
index 2c7893a091..44a276fd63 100644
--- a/src/transformers/models/trocr/processing_trocr.py
+++ b/src/transformers/models/trocr/processing_trocr.py
@@ -15,6 +15,7 @@
 """
 Processor class for TrOCR.
 """
+import warnings
 from contextlib import contextmanager
 
 from ...processing_utils import ProcessorMixin
@@ -40,6 +41,7 @@ class TrOCRProcessor(ProcessorMixin):
     def __init__(self, feature_extractor, tokenizer):
         super().__init__(feature_extractor, tokenizer)
         self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
 
     def __call__(self, *args, **kwargs):
         """
@@ -48,7 +50,35 @@ class TrOCRProcessor(ProcessorMixin):
         [`~TrOCRProcessor.as_target_processor`] this method forwards all its arguments to TrOCRTokenizer's
         [`~TrOCRTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information.
         """
-        return self.current_processor(*args, **kwargs)
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor(*args, **kwargs)
+
+        if "raw_speech" in kwargs:
+            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
+            audio = kwargs.pop("raw_speech")
+        else:
+            audio = kwargs.pop("audio", None)
+        text = kwargs.pop("text", None)
+        if len(args) > 0:
+            audio = args[0]
+            args = args[1:]
+
+        if audio is None and text is None:
+            raise ValueError("You need to specify either an `audio` or `text` input to process.")
+
+        if audio is not None:
+            inputs = self.feature_extractor(audio, *args, **kwargs)
+        if text is not None:
+            encodings = self.tokenizer(text, **kwargs)
+
+        if text is None:
+            return inputs
+        elif audio is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
 
     def batch_decode(self, *args, **kwargs):
         """
@@ -69,6 +99,13 @@ class TrOCRProcessor(ProcessorMixin):
         """
         Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning TrOCR.
         """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your audio inputs, or in a separate call."
+        )
+        self._in_target_context_manager = True
         self.current_processor = self.tokenizer
         yield
         self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
diff --git a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
index fed0414863..854831e45a 100644
--- a/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py
@@ -1650,9 +1650,8 @@ class TFWav2Vec2ForCTC(TFWav2Vec2PreTrainedModel):
         >>> # compute loss
         >>> target_transcription = "A MAN SAID TO THE UNIVERSE SIR I EXIST"
 
-        >>> # wrap processor as target processor to encode labels
-        >>> with processor.as_target_processor():
-        ...     labels = processor(transcription, return_tensors="tf").input_ids
+        >>> # Pass transcription as `text` to encode labels
+        >>> labels = processor(text=transcription, return_tensors="tf").input_ids
 
         >>> loss = model(input_values, labels=labels).loss
         ```"""
diff --git a/src/transformers/models/wav2vec2/processing_wav2vec2.py b/src/transformers/models/wav2vec2/processing_wav2vec2.py
index 1470c254dc..5763d4d59e 100644
--- a/src/transformers/models/wav2vec2/processing_wav2vec2.py
+++ b/src/transformers/models/wav2vec2/processing_wav2vec2.py
@@ -43,6 +43,7 @@ class Wav2Vec2Processor(ProcessorMixin):
     def __init__(self, feature_extractor, tokenizer):
         super().__init__(feature_extractor, tokenizer)
         self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
@@ -70,7 +71,35 @@ class Wav2Vec2Processor(ProcessorMixin):
         [`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
         [`~PreTrainedTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
         """
-        return self.current_processor(*args, **kwargs)
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor(*args, **kwargs)
+
+        if "raw_speech" in kwargs:
+            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
+            audio = kwargs.pop("raw_speech")
+        else:
+            audio = kwargs.pop("audio", None)
+        text = kwargs.pop("text", None)
+        if len(args) > 0:
+            audio = args[0]
+            args = args[1:]
+
+        if audio is None and text is None:
+            raise ValueError("You need to specify either an `audio` or `text` input to process.")
+
+        if audio is not None:
+            inputs = self.feature_extractor(audio, *args, **kwargs)
+        if text is not None:
+            encodings = self.tokenizer(text, **kwargs)
+
+        if text is None:
+            return inputs
+        elif audio is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
 
     def pad(self, *args, **kwargs):
         """
@@ -79,7 +108,28 @@ class Wav2Vec2Processor(ProcessorMixin):
         [`~Wav2Vec2Processor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
         [`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information.
         """
-        return self.current_processor.pad(*args, **kwargs)
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor.pad(*args, **kwargs)
+
+        input_features = kwargs.pop("input_features", None)
+        labels = kwargs.pop("labels", None)
+        if len(args) > 0:
+            input_features = args[0]
+            args = args[1:]
+
+        if input_features is not None:
+            input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
+        if labels is not None:
+            labels = self.tokenizer.pad(labels, **kwargs)
+
+        if labels is None:
+            return input_features
+        elif input_features is None:
+            return labels
+        else:
+            input_features["labels"] = labels["input_ids"]
+            return input_features
 
     def batch_decode(self, *args, **kwargs):
         """
@@ -101,6 +151,13 @@ class Wav2Vec2Processor(ProcessorMixin):
         Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
         Wav2Vec2.
         """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your audio inputs, or in a separate call."
+        )
+        self._in_target_context_manager = True
         self.current_processor = self.tokenizer
         yield
         self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
diff --git a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
index 4e7da07526..f09b5eb922 100644
--- a/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
+++ b/src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
@@ -16,6 +16,7 @@
 Speech processor class for Wav2Vec2
 """
 import os
+import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass
 from multiprocessing import get_context
@@ -99,6 +100,7 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
 
         self.decoder = decoder
         self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
 
     def save_pretrained(self, save_directory):
         super().save_pretrained(save_directory)
@@ -214,7 +216,35 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
         Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.__call__`]. Please refer to the docstring of the above two
         methods for more information.
         """
-        return self.current_processor(*args, **kwargs)
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor(*args, **kwargs)
+
+        if "raw_speech" in kwargs:
+            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
+            audio = kwargs.pop("raw_speech")
+        else:
+            audio = kwargs.pop("audio", None)
+        text = kwargs.pop("text", None)
+        if len(args) > 0:
+            audio = args[0]
+            args = args[1:]
+
+        if audio is None and text is None:
+            raise ValueError("You need to specify either an `audio` or `text` input to process.")
+
+        if audio is not None:
+            inputs = self.feature_extractor(audio, *args, **kwargs)
+        if text is not None:
+            encodings = self.tokenizer(text, **kwargs)
+
+        if text is None:
+            return inputs
+        elif audio is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
 
     def pad(self, *args, **kwargs):
         """
@@ -224,7 +254,28 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
         Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.pad`]. Please refer to the docstring of the above two methods
         for more information.
         """
-        return self.current_processor.pad(*args, **kwargs)
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor.pad(*args, **kwargs)
+
+        input_features = kwargs.pop("input_features", None)
+        labels = kwargs.pop("labels", None)
+        if len(args) > 0:
+            input_features = args[0]
+            args = args[1:]
+
+        if input_features is not None:
+            input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
+        if labels is not None:
+            labels = self.tokenizer.pad(labels, **kwargs)
+
+        if labels is None:
+            return input_features
+        elif input_features is None:
+            return labels
+        else:
+            input_features["labels"] = labels["input_ids"]
+            return input_features
 
     def batch_decode(
         self,
@@ -486,9 +537,16 @@ class Wav2Vec2ProcessorWithLM(ProcessorMixin):
     @contextmanager
     def as_target_processor(self):
         """
-        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
+        Temporarily sets the processor for processing the target. Useful for encoding the labels when fine-tuning
         Wav2Vec2.
         """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your audio inputs, or in a separate call."
+        )
+        self._in_target_context_manager = True
         self.current_processor = self.tokenizer
         yield
         self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 776c9a69db..7e259fce90 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1501,7 +1501,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         self.deprecation_warnings = (
             {}
         )  # Use to store when we have already noticed a deprecation warning (avoid overlogging).
-
+        self._in_target_context_manager = False
         super().__init__(**kwargs)
 
     @property
@@ -2431,8 +2431,12 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
     @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
     def __call__(
         self,
-        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
         text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
+        text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        text_pair_target: Optional[
+            Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]
+        ] = None,
         add_special_tokens: bool = True,
         padding: Union[bool, str, PaddingStrategy] = False,
         truncation: Union[bool, str, TruncationStrategy] = False,
@@ -2455,15 +2459,85 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         sequences.
 
         Args:
-            text (`str`, `List[str]`, `List[List[str]]`):
+            text (`str`, `List[str]`, `List[List[str]]`, *optional*):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            text_pair (`str`, `List[str]`, `List[List[str]]`):
+            text_pair (`str`, `List[str]`, `List[List[str]]`, *optional*):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_target (`str`, `List[str]`, `List[List[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
+                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
+                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_pair_target (`str`, `List[str]`, `List[List[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
+                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
+                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
         """
+        # To avoid duplicating
+        all_kwargs = dict(
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            stride=stride,
+            is_split_into_words=is_split_into_words,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+        )
+        all_kwargs.update(kwargs)
+        if text is None and text_target is None:
+            raise ValueError("You need to specify either `text` or `text_target`.")
+        if text is not None:
+            # The context manager will send the inputs as normal texts and not text_target, but we shouldn't change the
+            # input mode in this case.
+            if not self._in_target_context_manager:
+                self._switch_to_input_mode()
+            encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs)
+        if text_target is not None:
+            self._switch_to_target_mode()
+            target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **all_kwargs)
+        # Leave back tokenizer in input mode
+        self._switch_to_input_mode()
+
+        if text_target is None:
+            return encodings
+        elif text is None:
+            return target_encodings
+        else:
+            encodings["labels"] = target_encodings["input_ids"]
+            return encodings
+
+    def _call_one(
+        self,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
+        text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = False,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        is_split_into_words: bool = False,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs
+    ) -> BatchEncoding:
         # Input type checking for clearer error
         def _is_valid_text_input(t):
             if isinstance(t, str):
@@ -3456,13 +3530,34 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                 )
             self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True
 
+    def _switch_to_input_mode(self):
+        """
+        Private method to put the tokenizer in input mode (when it has different modes for input/outputs)
+        """
+        pass
+
+    def _switch_to_target_mode(self):
+        """
+        Private method to put the tokenizer in target mode (when it has different modes for input/outputs)
+        """
+        pass
+
     @contextmanager
     def as_target_tokenizer(self):
         """
         Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
         sequence-to-sequence models that need a slightly different processing for the labels.
         """
+        warnings.warn(
+            "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your "
+            "labels by using the argument `text_target` of the regular `__call__` method (either in the same call as "
+            "your input texts if you use the same keyword arguments, or in a separate call."
+        )
+        self._switch_to_target_mode()
+        self._in_target_context_manager = True
         yield
+        self._in_target_context_manager = False
+        self._switch_to_input_mode()
 
     @classmethod
     def register_for_auto_class(cls, auto_class="AutoTokenizer"):
@@ -3563,14 +3658,17 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         # docstyle-ignore
         formatted_warning = """
 `prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
-`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
-your targets.
+`__call__` method to prepare your inputs and targets.
 
 Here is a short example:
 
+model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)
+
+If you either need to use different keyword arguments for the source and target texts, you should do two calls like
+this:
+
 model_inputs = tokenizer(src_texts, ...)
-with tokenizer.as_target_tokenizer():
-    labels = tokenizer(tgt_texts, ...)
+labels = tokenizer(text_target=tgt_texts, ...)
 model_inputs["labels"] = labels["input_ids"]
 
 See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
diff --git a/src/transformers/utils/doc.py b/src/transformers/utils/doc.py
index 8f0caf825b..6761dec9c9 100644
--- a/src/transformers/utils/doc.py
+++ b/src/transformers/utils/doc.py
@@ -428,8 +428,7 @@ PT_SPEECH_CTC_SAMPLE = r"""
     ```
 
     ```python
-    >>> with processor.as_target_processor():
-    ...     inputs["labels"] = processor(dataset[0]["text"], return_tensors="pt").input_ids
+    >>> inputs["labels"] = processor(text=dataset[0]["text"], return_tensors="pt").input_ids
 
     >>> # compute loss
     >>> loss = model(**inputs).loss
@@ -849,8 +848,7 @@ TF_SPEECH_CTC_SAMPLE = r"""
     ```
 
     ```python
-    >>> with processor.as_target_processor():
-    ...     inputs["labels"] = processor(dataset[0]["text"], return_tensors="tf").input_ids
+    >>> inputs["labels"] = processor(text=dataset[0]["text"], return_tensors="tf").input_ids
 
     >>> # compute loss
     >>> loss = model(**inputs).loss
diff --git a/tests/models/bart/test_tokenization_bart.py b/tests/models/bart/test_tokenization_bart.py
index b8e216e69b..24ea6e1e5c 100644
--- a/tests/models/bart/test_tokenization_bart.py
+++ b/tests/models/bart/test_tokenization_bart.py
@@ -112,14 +112,13 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
             self.assertNotIn("decoder_attention_mask", batch)
 
     @require_torch
-    def test_as_target_tokenizer_target_length(self):
+    def test_tokenizer_as_target_length(self):
         tgt_text = [
             "Summary of the text.",
             "Another summary.",
         ]
         for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            with tokenizer.as_target_tokenizer():
-                targets = tokenizer(tgt_text, max_length=32, padding="max_length", return_tensors="pt")
+            targets = tokenizer(text_target=tgt_text, max_length=32, padding="max_length", return_tensors="pt")
             self.assertEqual(32, targets["input_ids"].shape[1])
 
     @require_torch
@@ -140,8 +139,7 @@ class TestTokenizationBart(TokenizerTesterMixin, unittest.TestCase):
         ]
         for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
             inputs = tokenizer(src_text, return_tensors="pt")
-            with tokenizer.as_target_tokenizer():
-                targets = tokenizer(tgt_text, return_tensors="pt")
+            targets = tokenizer(text_target=tgt_text, return_tensors="pt")
             input_ids = inputs["input_ids"]
             labels = targets["input_ids"]
             self.assertTrue((input_ids[:, 0] == tokenizer.bos_token_id).all().item())
diff --git a/tests/models/byt5/test_tokenization_byt5.py b/tests/models/byt5/test_tokenization_byt5.py
index 70cfa40ef9..85057c5278 100644
--- a/tests/models/byt5/test_tokenization_byt5.py
+++ b/tests/models/byt5/test_tokenization_byt5.py
@@ -152,10 +152,9 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "Summary of the text.",
             "Another summary.",
         ]
-        with tokenizer.as_target_tokenizer():
-            targets = tokenizer(
-                tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK
-            )
+        targets = tokenizer(
+            text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK
+        )
         self.assertEqual(32, targets["input_ids"].shape[1])
 
     def test_eos_in_input(self):
@@ -167,12 +166,10 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         expected_tgt_tokens = [86, 120, 112, 112, 100, 117, 124, 35, 114, 105, 35, 119, 107, 104, 35, 119, 104, 123, 119, 49, 35, 1]
         # fmt: on
 
-        batch = tokenizer(src_text)
-        with tokenizer.as_target_tokenizer():
-            targets = tokenizer(tgt_text)
+        batch = tokenizer(src_text, text_target=tgt_text)
 
         self.assertEqual(expected_src_tokens, batch["input_ids"][0])
-        self.assertEqual(expected_tgt_tokens, targets["input_ids"][0])
+        self.assertEqual(expected_tgt_tokens, batch["labels"][0])
 
     # cannot use default save_and_load_tokenzier test method because tokenzier has no vocab
     def test_save_and_load_tokenizer(self):
diff --git a/tests/models/canine/test_tokenization_canine.py b/tests/models/canine/test_tokenization_canine.py
index 0e016d523b..6ae27082cc 100644
--- a/tests/models/canine/test_tokenization_canine.py
+++ b/tests/models/canine/test_tokenization_canine.py
@@ -80,8 +80,9 @@ class CanineTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "What's the weater?",
             "It's about 25 degrees.",
         ]
-        with tokenizer.as_target_tokenizer():
-            targets = tokenizer(tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors="pt")
+        targets = tokenizer(
+            text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors="pt"
+        )
         self.assertEqual(32, targets["input_ids"].shape[1])
 
     # cannot use default save_and_load_tokenzier test method because tokenzier has no vocab
diff --git a/tests/models/dpr/test_tokenization_dpr.py b/tests/models/dpr/test_tokenization_dpr.py
index 2870e0bcf3..8ad2fea09c 100644
--- a/tests/models/dpr/test_tokenization_dpr.py
+++ b/tests/models/dpr/test_tokenization_dpr.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
 from transformers import (
     DPRContextEncoderTokenizer,
     DPRContextEncoderTokenizerFast,
diff --git a/tests/models/m2m_100/test_tokenization_m2m_100.py b/tests/models/m2m_100/test_tokenization_m2m_100.py
index 729deb6cd4..ca8349d940 100644
--- a/tests/models/m2m_100/test_tokenization_m2m_100.py
+++ b/tests/models/m2m_100/test_tokenization_m2m_100.py
@@ -187,9 +187,7 @@ class M2M100TokenizerIntegrationTest(unittest.TestCase):
         self.tokenizer.src_lang = "en"
         self.tokenizer.tgt_lang = "fr"
 
-        batch = self.tokenizer(self.src_text, padding=True, return_tensors="pt")
-        with self.tokenizer.as_target_tokenizer():
-            batch["labels"] = self.tokenizer(self.tgt_text, padding=True, return_tensors="pt").input_ids
+        batch = self.tokenizer(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="pt")
 
         batch["decoder_input_ids"] = shift_tokens_right(
             batch["labels"], self.tokenizer.pad_token_id, self.tokenizer.eos_token_id
@@ -217,17 +215,19 @@ class M2M100TokenizerIntegrationTest(unittest.TestCase):
         self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id])
 
     @require_torch
-    def test_as_target_tokenizer(self):
+    def test_tokenizer_target_mode(self):
         self.tokenizer.tgt_lang = "mr"
-        with self.tokenizer.as_target_tokenizer():
-            self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("mr")])
-            self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id])
+        self.tokenizer._switch_to_target_mode()
+        self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("mr")])
+        self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id])
+        self.tokenizer._switch_to_input_mode()
         self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id(self.tokenizer.src_lang)])
 
         self.tokenizer.tgt_lang = "zh"
-        with self.tokenizer.as_target_tokenizer():
-            self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("zh")])
-            self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id])
+        self.tokenizer._switch_to_target_mode()
+        self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id("zh")])
+        self.assertListEqual(self.tokenizer.suffix_tokens, [self.tokenizer.eos_token_id])
+        self.tokenizer._switch_to_input_mode()
         self.assertListEqual(self.tokenizer.prefix_tokens, [self.tokenizer.get_lang_id(self.tokenizer.src_lang)])
 
     @require_torch
diff --git a/tests/models/marian/test_modeling_marian.py b/tests/models/marian/test_modeling_marian.py
index e454f981b4..6ca951e37a 100644
--- a/tests/models/marian/test_modeling_marian.py
+++ b/tests/models/marian/test_modeling_marian.py
@@ -438,10 +438,7 @@ class TestMarian_EN_DE_More(MarianIntegrationTest):
         src, tgt = ["I am a small frog"], ["Ich bin ein kleiner Frosch."]
         expected_ids = [38, 121, 14, 697, 38848, 0]
 
-        model_inputs = self.tokenizer(src, return_tensors="pt").to(torch_device)
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(tgt, return_tensors="pt")
-        model_inputs["labels"] = targets["input_ids"].to(torch_device)
+        model_inputs = self.tokenizer(src, text_target=tgt, return_tensors="pt").to(torch_device)
 
         self.assertListEqual(expected_ids, model_inputs.input_ids[0].tolist())
 
diff --git a/tests/models/marian/test_tokenization_marian.py b/tests/models/marian/test_tokenization_marian.py
index 2cbc0b0a3f..6a079036bb 100644
--- a/tests/models/marian/test_tokenization_marian.py
+++ b/tests/models/marian/test_tokenization_marian.py
@@ -145,9 +145,8 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         src_ids = tokenizer(source_text).input_ids
         self.assertListEqual(src_ids, expected_src_ids)
 
-        with tokenizer.as_target_tokenizer():
-            target_ids = tokenizer(target_text).input_ids
-            self.assertListEqual(target_ids, expected_target_ids)
+        target_ids = tokenizer(text_target=target_text).input_ids
+        self.assertListEqual(target_ids, expected_target_ids)
 
         decoded = tokenizer.decode(target_ids, skip_special_tokens=True)
         self.assertEqual(decoded, target_text)
diff --git a/tests/models/mbart/test_tokenization_mbart.py b/tests/models/mbart/test_tokenization_mbart.py
index e80531051b..f65662dbe2 100644
--- a/tests/models/mbart/test_tokenization_mbart.py
+++ b/tests/models/mbart/test_tokenization_mbart.py
@@ -265,33 +265,27 @@ class MBartEnroIntegrationTest(unittest.TestCase):
 
     @require_torch
     def test_batch_fairseq_parity(self):
-        batch = self.tokenizer(self.src_text, padding=True)
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(self.tgt_text, padding=True, return_tensors="pt")
-        labels = targets["input_ids"]
-        batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id).tolist()
+        batch = self.tokenizer(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="pt")
+        batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
 
         # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4
-        assert batch.input_ids[1][-2:] == [2, EN_CODE]
-        assert batch.decoder_input_ids[1][0] == RO_CODE
+        assert batch.input_ids[1][-2:].tolist() == [2, EN_CODE]
+        assert batch.decoder_input_ids[1][0].tolist() == RO_CODE
         assert batch.decoder_input_ids[1][-1] == 2
-        assert labels[1][-2:].tolist() == [2, RO_CODE]
+        assert batch.labels[1][-2:].tolist() == [2, RO_CODE]
 
     @require_torch
     def test_enro_tokenizer_prepare_batch(self):
         batch = self.tokenizer(
-            self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt"
+            self.src_text,
+            text_target=self.tgt_text,
+            padding=True,
+            truncation=True,
+            max_length=len(self.expected_src_tokens),
+            return_tensors="pt",
         )
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(
-                self.tgt_text,
-                padding=True,
-                truncation=True,
-                max_length=len(self.expected_src_tokens),
-                return_tensors="pt",
-            )
-        labels = targets["input_ids"]
-        batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
+
+        batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
 
         self.assertIsInstance(batch, BatchEncoding)
 
@@ -306,8 +300,9 @@ class MBartEnroIntegrationTest(unittest.TestCase):
 
     def test_seq2seq_max_length(self):
         batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt")
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt")
+        targets = self.tokenizer(
+            text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt"
+        )
         labels = targets["input_ids"]
         batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
 
diff --git a/tests/models/mbart50/test_tokenization_mbart50.py b/tests/models/mbart50/test_tokenization_mbart50.py
index 5a65d88566..d10d51df90 100644
--- a/tests/models/mbart50/test_tokenization_mbart50.py
+++ b/tests/models/mbart50/test_tokenization_mbart50.py
@@ -256,35 +256,27 @@ class MBart50OneToManyIntegrationTest(unittest.TestCase):
 
     @require_torch
     def test_batch_fairseq_parity(self):
-        batch = self.tokenizer(self.src_text, padding=True)
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(self.tgt_text, padding=True, return_tensors="pt")
-        labels = targets["input_ids"]
-        batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id).tolist()
-        labels = labels.tolist()
+        batch = self.tokenizer(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="pt")
+        batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
 
         # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4
         assert batch.input_ids[1][0] == EN_CODE
         assert batch.input_ids[1][-1] == 2
-        assert labels[1][0] == RO_CODE
-        assert labels[1][-1] == 2
-        assert batch.decoder_input_ids[1][:2] == [2, RO_CODE]
+        assert batch.labels[1][0] == RO_CODE
+        assert batch.labels[1][-1] == 2
+        assert batch.decoder_input_ids[1][:2].tolist() == [2, RO_CODE]
 
     @require_torch
     def test_tokenizer_prepare_batch(self):
         batch = self.tokenizer(
-            self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt"
+            self.src_text,
+            text_target=self.tgt_text,
+            padding=True,
+            truncation=True,
+            max_length=len(self.expected_src_tokens),
+            return_tensors="pt",
         )
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(
-                self.tgt_text,
-                padding=True,
-                truncation=True,
-                max_length=len(self.expected_src_tokens),
-                return_tensors="pt",
-            )
-        labels = targets["input_ids"]
-        batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
+        batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
 
         self.assertIsInstance(batch, BatchEncoding)
 
@@ -299,8 +291,9 @@ class MBart50OneToManyIntegrationTest(unittest.TestCase):
 
     def test_seq2seq_max_target_length(self):
         batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt")
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt")
+        targets = self.tokenizer(
+            text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt"
+        )
         labels = targets["input_ids"]
         batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
 
diff --git a/tests/models/mctct/test_processor_mctct.py b/tests/models/mctct/test_processor_mctct.py
index 83201f4102..821e44b48e 100644
--- a/tests/models/mctct/test_processor_mctct.py
+++ b/tests/models/mctct/test_processor_mctct.py
@@ -125,8 +125,7 @@ class MCTCTProcessorTest(unittest.TestCase):
 
         input_str = "This is a test string"
 
-        with processor.as_target_processor():
-            encoded_processor = processor(input_str)
+        encoded_processor = processor(text=input_str)
 
         encoded_tok = tokenizer(input_str)
 
diff --git a/tests/models/mvp/test_tokenization_mvp.py b/tests/models/mvp/test_tokenization_mvp.py
index ad3fad67c9..71e83fba0e 100644
--- a/tests/models/mvp/test_tokenization_mvp.py
+++ b/tests/models/mvp/test_tokenization_mvp.py
@@ -112,14 +112,13 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase):
             self.assertNotIn("decoder_attention_mask", batch)
 
     @require_torch
-    def test_as_target_tokenizer_target_length(self):
+    def test_tokenizer_as_target_length(self):
         tgt_text = [
             "Summary of the text.",
             "Another summary.",
         ]
         for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            with tokenizer.as_target_tokenizer():
-                targets = tokenizer(tgt_text, max_length=32, padding="max_length", return_tensors="pt")
+            targets = tokenizer(text_target=tgt_text, max_length=32, padding="max_length", return_tensors="pt")
             self.assertEqual(32, targets["input_ids"].shape[1])
 
     @require_torch
@@ -139,11 +138,9 @@ class TestTokenizationMvp(TokenizerTesterMixin, unittest.TestCase):
             "Summary of the text.",
         ]
         for tokenizer in [self.default_tokenizer, self.default_tokenizer_fast]:
-            inputs = tokenizer(src_text, return_tensors="pt")
-            with tokenizer.as_target_tokenizer():
-                targets = tokenizer(tgt_text, return_tensors="pt")
+            inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
             input_ids = inputs["input_ids"]
-            labels = targets["input_ids"]
+            labels = inputs["labels"]
             self.assertTrue((input_ids[:, 0] == tokenizer.bos_token_id).all().item())
             self.assertTrue((labels[:, 0] == tokenizer.bos_token_id).all().item())
             self.assertTrue((input_ids[:, -1] == tokenizer.eos_token_id).all().item())
diff --git a/tests/models/nllb/test_tokenization_nllb.py b/tests/models/nllb/test_tokenization_nllb.py
index 10575084a7..d77b101fa7 100644
--- a/tests/models/nllb/test_tokenization_nllb.py
+++ b/tests/models/nllb/test_tokenization_nllb.py
@@ -373,19 +373,15 @@ class NllbDistilledIntegrationTest(unittest.TestCase):
     @require_torch
     def test_enro_tokenizer_prepare_batch(self):
         batch = self.tokenizer(
-            self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt"
+            self.src_text,
+            text_target=self.tgt_text,
+            padding=True,
+            truncation=True,
+            max_length=len(self.expected_src_tokens),
+            return_tensors="pt",
         )
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(
-                self.tgt_text,
-                padding=True,
-                truncation=True,
-                max_length=len(self.expected_src_tokens),
-                return_tensors="pt",
-            )
-        labels = targets["input_ids"]
         batch["decoder_input_ids"] = shift_tokens_right(
-            labels, self.tokenizer.pad_token_id, self.tokenizer.lang_code_to_id["ron_Latn"]
+            batch["labels"], self.tokenizer.pad_token_id, self.tokenizer.lang_code_to_id["ron_Latn"]
         )
 
         self.assertIsInstance(batch, BatchEncoding)
@@ -401,8 +397,9 @@ class NllbDistilledIntegrationTest(unittest.TestCase):
 
     def test_seq2seq_max_length(self):
         batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt")
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt")
+        targets = self.tokenizer(
+            text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt"
+        )
         labels = targets["input_ids"]
         batch["decoder_input_ids"] = shift_tokens_right(
             labels,
diff --git a/tests/models/pegasus/test_tokenization_pegasus.py b/tests/models/pegasus/test_tokenization_pegasus.py
index d473725f9a..de2886a5e1 100644
--- a/tests/models/pegasus/test_tokenization_pegasus.py
+++ b/tests/models/pegasus/test_tokenization_pegasus.py
@@ -109,10 +109,9 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         src_texts = ["This is going to be way too long." * 150, "short example"]
         tgt_texts = ["not super long but more than 5 tokens", "tiny"]
         batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt")
-        with self._large_tokenizer.as_target_tokenizer():
-            targets = self._large_tokenizer(
-                tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt"
-            )
+        targets = self._large_tokenizer(
+            text_target=tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt"
+        )
 
         assert batch.input_ids.shape == (2, 1024)
         assert batch.attention_mask.shape == (2, 1024)
@@ -174,10 +173,9 @@ class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         src_texts = ["This is going to be way too long." * 1000, "short example"]
         tgt_texts = ["not super long but more than 5 tokens", "tiny"]
         batch = self._large_tokenizer(src_texts, padding=True, truncation=True, return_tensors="pt")
-        with self._large_tokenizer.as_target_tokenizer():
-            targets = self._large_tokenizer(
-                tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt"
-            )
+        targets = self._large_tokenizer(
+            text_target=tgt_texts, max_length=5, padding=True, truncation=True, return_tensors="pt"
+        )
 
         assert batch.input_ids.shape == (2, 4096)
         assert batch.attention_mask.shape == (2, 4096)
diff --git a/tests/models/perceiver/test_tokenization_perceiver.py b/tests/models/perceiver/test_tokenization_perceiver.py
index ca61e9c856..3c7a67bcd2 100644
--- a/tests/models/perceiver/test_tokenization_perceiver.py
+++ b/tests/models/perceiver/test_tokenization_perceiver.py
@@ -146,10 +146,9 @@ class PerceiverTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "Summary of the text.",
             "Another summary.",
         ]
-        with tokenizer.as_target_tokenizer():
-            targets = tokenizer(
-                tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK
-            )
+        targets = tokenizer(
+            text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK
+        )
         self.assertEqual(32, targets["input_ids"].shape[1])
 
     # cannot use default save_and_load_tokenzier test method because tokenzier has no vocab
diff --git a/tests/models/plbart/test_tokenization_plbart.py b/tests/models/plbart/test_tokenization_plbart.py
index 9aed6040f3..2ce7cafbda 100644
--- a/tests/models/plbart/test_tokenization_plbart.py
+++ b/tests/models/plbart/test_tokenization_plbart.py
@@ -299,33 +299,26 @@ class PLBartPythonEnIntegrationTest(unittest.TestCase):
 
     @require_torch
     def test_batch_fairseq_parity(self):
-        batch = self.tokenizer(self.src_text, padding=True)
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(self.tgt_text, padding=True, return_tensors="pt")
-        labels = targets["input_ids"]
-        batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id).tolist()
+        batch = self.tokenizer(self.src_text, text_target=self.tgt_text, padding=True, return_tensors="pt")
+        batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
 
         # fairseq batch: https://gist.github.com/sshleifer/cba08bc2109361a74ac3760a7e30e4f4
-        self.assertEqual(batch.input_ids[1][-2:], [2, PYTHON_CODE])
+        self.assertEqual(batch.input_ids[1][-2:].tolist(), [2, PYTHON_CODE])
         self.assertEqual(batch.decoder_input_ids[1][0], EN_CODE)
         self.assertEqual(batch.decoder_input_ids[1][-1], 2)
-        self.assertEqual(labels[1][-2:].tolist(), [2, EN_CODE])
+        self.assertEqual(batch.labels[1][-2:].tolist(), [2, EN_CODE])
 
     @require_torch
     def test_python_en_tokenizer_prepare_batch(self):
         batch = self.tokenizer(
-            self.src_text, padding=True, truncation=True, max_length=len(self.expected_src_tokens), return_tensors="pt"
+            self.src_text,
+            text_target=self.tgt_text,
+            padding=True,
+            truncation=True,
+            max_length=len(self.expected_src_tokens),
+            return_tensors="pt",
         )
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(
-                self.tgt_text,
-                padding=True,
-                truncation=True,
-                max_length=len(self.expected_src_tokens),
-                return_tensors="pt",
-            )
-        labels = targets["input_ids"]
-        batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
+        batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], self.tokenizer.pad_token_id)
 
         self.assertIsInstance(batch, BatchEncoding)
 
@@ -340,8 +333,9 @@ class PLBartPythonEnIntegrationTest(unittest.TestCase):
 
     def test_seq2seq_max_length(self):
         batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pt")
-        with self.tokenizer.as_target_tokenizer():
-            targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt")
+        targets = self.tokenizer(
+            text_target=self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pt"
+        )
         labels = targets["input_ids"]
         batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id)
 
diff --git a/tests/models/speech_to_text/test_processor_speech_to_text.py b/tests/models/speech_to_text/test_processor_speech_to_text.py
index e6e43f1bb8..d519f005d3 100644
--- a/tests/models/speech_to_text/test_processor_speech_to_text.py
+++ b/tests/models/speech_to_text/test_processor_speech_to_text.py
@@ -125,8 +125,7 @@ class Speech2TextProcessorTest(unittest.TestCase):
 
         input_str = "This is a test string"
 
-        with processor.as_target_processor():
-            encoded_processor = processor(input_str)
+        encoded_processor = processor(text=input_str)
 
         encoded_tok = tokenizer(input_str)
 
diff --git a/tests/models/t5/test_tokenization_t5.py b/tests/models/t5/test_tokenization_t5.py
index 1c0fde222c..28d85c77c9 100644
--- a/tests/models/t5/test_tokenization_t5.py
+++ b/tests/models/t5/test_tokenization_t5.py
@@ -210,10 +210,9 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
             "Summary of the text.",
             "Another summary.",
         ]
-        with tokenizer.as_target_tokenizer():
-            targets = tokenizer(
-                tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK
-            )
+        targets = tokenizer(
+            text_target=tgt_text, max_length=32, padding="max_length", truncation=True, return_tensors=FRAMEWORK
+        )
         self.assertEqual(32, targets["input_ids"].shape[1])
 
     def test_outputs_not_longer_than_maxlen(self):
@@ -235,12 +234,10 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         expected_src_tokens = [71, 307, 8986, 21, 4505, 1635, 1707, 5, 1]
         expected_tgt_tokens = [20698, 13, 8, 1499, 5, 1]
 
-        batch = tokenizer(src_text)
-        with tokenizer.as_target_tokenizer():
-            targets = tokenizer(tgt_text)
+        batch = tokenizer(src_text, text_target=tgt_text)
 
         self.assertEqual(expected_src_tokens, batch["input_ids"][0])
-        self.assertEqual(expected_tgt_tokens, targets["input_ids"][0])
+        self.assertEqual(expected_tgt_tokens, batch["labels"][0])
 
     def test_token_type_ids(self):
         src_text_1 = ["A first paragraph for summarization."]
diff --git a/tests/models/tapex/test_tokenization_tapex.py b/tests/models/tapex/test_tokenization_tapex.py
index c959b78021..dec0f507ed 100644
--- a/tests/models/tapex/test_tokenization_tapex.py
+++ b/tests/models/tapex/test_tokenization_tapex.py
@@ -859,9 +859,8 @@ class TapexTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
         answer_text = "tapex is a good model!"
         expected_src_tokens = [0, 90, 5776, 1178, 16, 10, 205, 1421, 328, 2]
-        with tokenizer.as_target_tokenizer():
-            answer_encoding = tokenizer(answer=answer_text)
-            self.assertListEqual(answer_encoding.input_ids, expected_src_tokens)
+        answer_encoding = tokenizer(answer=answer_text)
+        self.assertListEqual(answer_encoding.input_ids, expected_src_tokens)
 
     @slow
     def test_tokenizer_lower_case(self):
@@ -870,23 +869,21 @@ class TapexTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
         answer_text = "Beijing, London, Paris"
         answer_text_lower = "beijing, london, paris"
 
-        with cased_tokenizer.as_target_tokenizer():
-            with uncased_tokenizer.as_target_tokenizer():
-                self.assertNotEqual(
-                    cased_tokenizer(answer=answer_text).input_ids, uncased_tokenizer(answer=answer_text).input_ids
-                )
-                self.assertEqual(
-                    cased_tokenizer(answer=answer_text_lower).input_ids,
-                    uncased_tokenizer(answer=answer_text).input_ids,
-                )
-                # batched encoding assert
-                self.assertNotEqual(
-                    cased_tokenizer(answer=[answer_text]).input_ids, uncased_tokenizer(answer=[answer_text]).input_ids
-                )
-                self.assertEqual(
-                    cased_tokenizer(answer=[answer_text_lower]).input_ids,
-                    uncased_tokenizer(answer=[answer_text]).input_ids,
-                )
+        self.assertNotEqual(
+            cased_tokenizer(answer=answer_text).input_ids, uncased_tokenizer(answer=answer_text).input_ids
+        )
+        self.assertEqual(
+            cased_tokenizer(answer=answer_text_lower).input_ids,
+            uncased_tokenizer(answer=answer_text).input_ids,
+        )
+        # batched encoding assert
+        self.assertNotEqual(
+            cased_tokenizer(answer=[answer_text]).input_ids, uncased_tokenizer(answer=[answer_text]).input_ids
+        )
+        self.assertEqual(
+            cased_tokenizer(answer=[answer_text_lower]).input_ids,
+            uncased_tokenizer(answer=[answer_text]).input_ids,
+        )
         # test input encoding lowercase
         question = "Greece held its last Summer Olympics in 2004"
         table_dict = {
diff --git a/tests/models/wav2vec2/test_processor_wav2vec2.py b/tests/models/wav2vec2/test_processor_wav2vec2.py
index 8b7188f8eb..5f1c259061 100644
--- a/tests/models/wav2vec2/test_processor_wav2vec2.py
+++ b/tests/models/wav2vec2/test_processor_wav2vec2.py
@@ -118,8 +118,7 @@ class Wav2Vec2ProcessorTest(unittest.TestCase):
 
         input_str = "This is a test string"
 
-        with processor.as_target_processor():
-            encoded_processor = processor(input_str)
+        encoded_processor = processor(text=input_str)
 
         encoded_tok = tokenizer(input_str)
 
diff --git a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
index f5b3eea926..d66a592386 100644
--- a/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
+++ b/tests/models/wav2vec2_with_lm/test_processor_wav2vec2_with_lm.py
@@ -164,8 +164,7 @@ class Wav2Vec2ProcessorWithLMTest(unittest.TestCase):
 
         input_str = "This is a test string"
 
-        with processor.as_target_processor():
-            encoded_processor = processor(input_str)
+        encoded_processor = processor(text=input_str)
 
         encoded_tok = tokenizer(input_str)