Remove deprecated (#8604)

* Remove old deprecated arguments

Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>

* Remove needless imports

* Fix tests

Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>
2020-11-17 15:11:29 -05:00
parent 3095ee9dab
commit dd52804f5f
37 changed files with 22 additions and 610 deletions
--- a/examples/seq2seq/test_finetune_trainer.py
+++ b/examples/seq2seq/test_finetune_trainer.py
@@ -138,7 +138,7 @@ class TestFinetuneTrainer(TestCasePlus):
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            predict_with_generate=True,
-            evaluate_during_training=True,
+            evaluation_strategy="steps",
            do_train=True,
            do_eval=True,
            warmup_steps=0,
@@ -179,7 +179,7 @@ class TestFinetuneTrainer(TestCasePlus):
            --per_device_eval_batch_size 4
            --learning_rate 3e-3
            --warmup_steps 8
-            --evaluate_during_training
+            --evaluation_strategy steps
            --predict_with_generate
            --logging_steps 0
            --save_steps {str(eval_steps)}
--- a/examples/token-classification/run_ner_old.py
+++ b/examples/token-classification/run_ner_old.py
@@ -254,7 +254,7 @@ def main():
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
-        if trainer.is_world_master():
+        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)
    # Evaluation
@@ -265,7 +265,7 @@ def main():
        result = trainer.evaluate()
        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
-        if trainer.is_world_master():
+        if trainer.is_world_process_zero():
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key, value in result.items():
--- a/src/transformers/data/processors/squad.py
+++ b/src/transformers/data/processors/squad.py
@@ -145,11 +145,11 @@ def squad_convert_example_to_features(
    # in the way they compute mask of added tokens.
    tokenizer_type = type(tokenizer).__name__.replace("Tokenizer", "").lower()
    sequence_added_tokens = (
-        tokenizer.max_len - tokenizer.max_len_single_sentence + 1
+        tokenizer.model_max_length - tokenizer.max_len_single_sentence + 1
        if tokenizer_type in MULTI_SEP_TOKENS_TOKENIZERS_SET
-        else tokenizer.max_len - tokenizer.max_len_single_sentence
+        else tokenizer.model_max_length - tokenizer.max_len_single_sentence
    )
-    sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair
+    sequence_pair_added_tokens = tokenizer.model_max_length - tokenizer.max_len_sentences_pair
    span_doc_tokens = all_doc_tokens
    while len(spans) * doc_stride < len(all_doc_tokens):
--- a/src/transformers/models/albert/modeling_albert.py
+++ b/src/transformers/models/albert/modeling_albert.py
@@ -16,7 +16,6 @@
 import math
 import os
 import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple
@@ -742,7 +741,6 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        r"""
        labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`):
@@ -753,8 +751,6 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
            (see :obj:`input_ids` docstring) Indices should be in ``[0, 1]``. ``0`` indicates original order (sequence
            A, then sequence B), ``1`` indicates switched order (sequence B, then sequence A).
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        Returns:
@@ -773,14 +769,6 @@ class AlbertForPreTraining(AlbertPreTrainedModel):
            >>> sop_logits = outputs.sop_logits
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.albert(
@@ -898,23 +886,13 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.albert(
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@@ -15,7 +15,6 @@
 """PyTorch BART model, ported from the fairseq repo."""
 import math
 import random
 import warnings
 from typing import Dict, List, Optional, Tuple
 import numpy as np
@@ -529,7 +528,6 @@ class BartDecoder(nn.Module):
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        **unused,
    ):
        """
        Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al.,
@@ -551,18 +549,6 @@ class BartDecoder(nn.Module):
                - hidden states
                - attentions
        """
        if "decoder_cached_states" in unused:
            warnings.warn(
                "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = unused.pop("decoder_cached_states")
        if "decoder_past_key_values" in unused:
            warnings.warn(
                "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = unused.pop("decoder_past_key_values")
        # check attention mask and invert
        if encoder_padding_mask is not None:
@@ -873,14 +859,7 @@ class BartModel(PretrainedBartModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        if "decoder_past_key_values" in kwargs:
            warnings.warn(
                "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("decoder_past_key_values")
        if decoder_input_ids is None:
            use_cache = False
@@ -1006,7 +985,6 @@ class BartForConditionalGeneration(PretrainedBartModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **unused,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -1034,24 +1012,6 @@ class BartForConditionalGeneration(PretrainedBartModel):
            >>> tokenizer.decode(predictions).split()
            >>> # ['good', 'great', 'all', 'really', 'very']
        """
        if "lm_labels" in unused:
            warnings.warn(
                "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = unused.pop("lm_labels")
        if "decoder_cached_states" in unused:
            warnings.warn(
                "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = unused.pop("decoder_cached_states")
        if "decoder_past_key_values" in unused:
            warnings.warn(
                "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = unused.pop("decoder_past_key_values")
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
--- a/src/transformers/models/bert/modeling_bert.py
+++ b/src/transformers/models/bert/modeling_bert.py
@@ -896,7 +896,6 @@ class BertForPreTraining(BertPreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape ``(batch_size, sequence_length)``, `optional`):
@@ -928,13 +927,6 @@ class BertForPreTraining(BertPreTrainedModel):
            >>> prediction_logits = outputs.prediction_logits
            >>> seq_relationship_logits = outputs.seq_relationship_logits
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.bert(
@@ -1136,24 +1128,13 @@ class BertForMaskedLM(BertPreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert "lm_labels" not in kwargs, "Use `BertWithLMHead` for autoregressive language modeling task."
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
--- a/src/transformers/models/ctrl/modeling_ctrl.py
+++ b/src/transformers/models/ctrl/modeling_ctrl.py
@@ -15,9 +15,6 @@
 # limitations under the License.
 """ PyTorch CTRL model."""
 import warnings
 import numpy as np
 import torch
 import torch.nn as nn
@@ -369,15 +366,7 @@ class CTRLModel(CTRLPreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        if "past" in kwargs:
            warnings.warn(
                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("past")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        use_cache = use_cache if use_cache is not None else self.config.use_cache
@@ -542,7 +531,6 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -550,13 +538,6 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
            ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
            ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
        """
        if "past" in kwargs:
            warnings.warn(
                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("past")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        transformer_outputs = self.transformer(
--- a/src/transformers/models/distilbert/modeling_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_distilbert.py
@@ -20,7 +20,6 @@
 import copy
 import math
 import warnings
 import numpy as np
 import torch
@@ -526,23 +525,13 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        dlbrt_output = self.distilbert(
--- a/src/transformers/models/electra/modeling_electra.py
+++ b/src/transformers/models/electra/modeling_electra.py
@@ -16,7 +16,6 @@
 import math
 import os
 import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple
@@ -1000,23 +999,13 @@ class ElectraForMaskedLM(ElectraPreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
            config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        generator_hidden_states = self.electra(
--- a/src/transformers/models/fsmt/modeling_fsmt.py
+++ b/src/transformers/models/fsmt/modeling_fsmt.py
@@ -29,7 +29,6 @@
 import math
 import random
 import warnings
 from typing import Any, Dict, List, Optional, Tuple
 import torch
@@ -618,7 +617,6 @@ class FSMTDecoder(nn.Module):
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        **unused,
    ):
        """
        Includes several features from "Jointly Learning to Align and Translate with Transformer Models" (Garg et al.,
@@ -640,19 +638,6 @@ class FSMTDecoder(nn.Module):
                - hidden states
                - attentions
        """
        if "decoder_cached_states" in unused:
            warnings.warn(
                "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = unused.pop("decoder_cached_states")
        if "decoder_past_key_values" in unused:
            warnings.warn(
                "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = unused.pop("decoder_past_key_values")
        # check attention mask and invert
        if encoder_padding_mask is not None:
            encoder_padding_mask = invert_mask(encoder_padding_mask)
@@ -933,15 +918,7 @@ class FSMTModel(PretrainedFSMTModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        if "decoder_past_key_values" in kwargs:
            warnings.warn(
                "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("decoder_past_key_values")
        if decoder_input_ids is None:
            use_cache = False
@@ -1071,7 +1048,6 @@ class FSMTForConditionalGeneration(PretrainedFSMTModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **unused,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
--- a/src/transformers/models/gpt2/modeling_gpt2.py
+++ b/src/transformers/models/gpt2/modeling_gpt2.py
@@ -16,7 +16,6 @@
 """PyTorch OpenAI GPT-2 model."""
 import os
 import warnings
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
@@ -528,16 +527,7 @@ class GPT2Model(GPT2PreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        if "past" in kwargs:
            warnings.warn(
                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("past")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -758,7 +748,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -766,13 +755,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
            ``labels = input_ids`` Indices are selected in ``[-100, 0, ..., config.vocab_size]`` All labels set to
            ``-100`` are ignored (masked), the loss is only computed for labels in ``[0, ..., config.vocab_size]``
        """
        if "past" in kwargs:
            warnings.warn(
                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("past")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        transformer_outputs = self.transformer(
@@ -900,8 +882,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
            Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
            num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see
            `input_ids` above)
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        Return:
@@ -930,19 +910,6 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
            >>> mc_logits = outputs.mc_logits
        """
        if "lm_labels" in kwargs:
            warnings.warn(
                "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("lm_labels")
        if "past" in kwargs:
            warnings.warn(
                "The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("past")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        transformer_outputs = self.transformer(
--- a/src/transformers/models/gpt2/tokenization_gpt2.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2.py
@@ -17,7 +17,6 @@
 import json
 import os
 import warnings
 from functools import lru_cache
 from typing import Optional, Tuple
@@ -293,13 +292,6 @@ class GPT2Tokenizer(PreTrainedTokenizer):
        return vocab_file, merge_file
    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        if "is_pretokenized" in kwargs:
            warnings.warn(
                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
                FutureWarning,
            )
            is_split_into_words = kwargs.pop("is_pretokenized")
        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
        if is_split_into_words or add_prefix_space:
            text = " " + text
--- a/src/transformers/models/gpt2/tokenization_gpt2_fast.py
+++ b/src/transformers/models/gpt2/tokenization_gpt2_fast.py
@@ -16,7 +16,6 @@
 import json
 import warnings
 from typing import Optional, Tuple
 from tokenizers import pre_tokenizers
@@ -151,13 +150,6 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
        self.add_prefix_space = add_prefix_space
    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        if "is_pretokenized" in kwargs:
            warnings.warn(
                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
                FutureWarning,
            )
            is_split_into_words = kwargs.pop("is_pretokenized")
        is_split_into_words = kwargs.get("is_split_into_words", False)
        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
@@ -167,14 +159,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
        return super()._batch_encode_plus(*args, **kwargs)
    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
-        if "is_pretokenized" in kwargs:
+        is_split_into_words = kwargs.get("is_split_into_words", False)
-            warnings.warn(
-                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
-                FutureWarning,
-            )
-            is_split_into_words = kwargs.pop("is_pretokenized")
-        else:
-            is_split_into_words = kwargs.get("is_split_into_words", False)
        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
--- a/src/transformers/models/longformer/modeling_longformer.py
+++ b/src/transformers/models/longformer/modeling_longformer.py
@@ -15,7 +15,6 @@
 """PyTorch Longformer model. """
 import math
 import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple
@@ -1509,7 +1508,6 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -1538,14 +1536,6 @@ class LongformerForMaskedLM(LongformerPreTrainedModel):
            >>> loss = outputs.loss
            >>> prediction_logits = output.logits
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.longformer(
--- a/src/transformers/models/mobilebert/modeling_mobilebert.py
+++ b/src/transformers/models/mobilebert/modeling_mobilebert.py
@@ -1109,7 +1109,6 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -1119,12 +1118,6 @@ class MobileBertForMaskedLM(MobileBertPreTrainedModel):
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.mobilebert(
--- a/src/transformers/models/openai/modeling_openai.py
+++ b/src/transformers/models/openai/modeling_openai.py
@@ -19,7 +19,6 @@
 import json
 import math
 import os
 import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple
@@ -645,7 +644,6 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        mc_token_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, num_choices)`, `optional`, default to index of the last token of the input):
@@ -659,8 +657,6 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
            Labels for computing the multiple choice classification loss. Indices should be in ``[0, ...,
            num_choices]`` where `num_choices` is the size of the second dimension of the input tensors. (see
            `input_ids` above)
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        Return:
@@ -683,13 +679,6 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
            >>> mc_logits = outputs.mc_logits
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if "lm_labels" in kwargs:
            warnings.warn(
                "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        transformer_outputs = self.transformer(
            input_ids,
--- a/src/transformers/models/prophetnet/tokenization_prophetnet.py
+++ b/src/transformers/models/prophetnet/tokenization_prophetnet.py
@@ -302,7 +302,7 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
        **kwargs,
    ) -> BatchEncoding:
        if max_length is None:
-            max_length = self.max_len
+            max_length = self.model_max_length
        model_inputs = self(
            src_texts,
            add_special_tokens=True,
--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
@@ -16,7 +16,6 @@
 """PyTorch RoBERTa model. """
 import math
 import warnings
 import torch
 import torch.nn as nn
@@ -872,7 +871,6 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
@@ -882,13 +880,6 @@ class RobertaForMaskedLM(RobertaPreTrainedModel):
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        """
        if "masked_lm_labels" in kwargs:
            warnings.warn(
                "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("masked_lm_labels")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.roberta(
--- a/src/transformers/models/roberta/tokenization_roberta.py
+++ b/src/transformers/models/roberta/tokenization_roberta.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """Tokenization classes for RoBERTa."""
 import warnings
 from typing import List, Optional
 from ...tokenization_utils import AddedToken
@@ -251,13 +250,6 @@ class RobertaTokenizer(GPT2Tokenizer):
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        if "is_pretokenized" in kwargs:
            warnings.warn(
                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
                FutureWarning,
            )
            is_split_into_words = kwargs.pop("is_pretokenized")
        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
            text = " " + text
--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -18,7 +18,6 @@
 import copy
 import math
 import os
 import warnings
 import torch
 import torch.nn.functional as F
@@ -1048,7 +1047,6 @@ class T5Model(T5PreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        r"""
        Returns:
@@ -1066,20 +1064,6 @@ class T5Model(T5PreTrainedModel):
            >>> last_hidden_states = outputs.last_hidden_state
        """
        if "decoder_past_key_value_states" in kwargs:
            warnings.warn(
                "The `decoder_past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("decoder_past_key_value_states")
        if "decoder_past_key_values" in kwargs:
            warnings.warn(
                "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("decoder_past_key_values")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1198,15 +1182,12 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to ``-100`` are ignored (masked), the loss is only computed for
            labels in ``[0, ..., config.vocab_size]``
        kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
            Used to hide legacy arguments that have been deprecated.
        Returns:
@@ -1226,27 +1207,6 @@ class T5ForConditionalGeneration(T5PreTrainedModel):
            >>> input_ids = tokenizer("summarize: studies have shown that owning a dog is good for you ", return_tensors="pt").input_ids  # Batch size 1
            >>> outputs = model.generate(input_ids)
        """
        if "lm_labels" in kwargs:
            warnings.warn(
                "The `lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("lm_labels")
        if "decoder_past_key_value_states" in kwargs:
            warnings.warn(
                "The `decoder_past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("decoder_past_key_value_states")
        if "decoder_past_key_values" in kwargs:
            warnings.warn(
                "The `decoder_past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                FutureWarning,
            )
            past_key_values = kwargs.pop("decoder_past_key_values")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
--- a/src/transformers/models/t5/modeling_tf_t5.py
+++ b/src/transformers/models/t5/modeling_tf_t5.py
@@ -595,7 +595,6 @@ class TFT5MainLayer(tf.keras.layers.Layer):
        output_attentions=None,
        output_hidden_states=None,
        training=False,
        **kwargs,
    ) -> Tuple:
        if isinstance(inputs, (tuple, list)):
            input_ids = inputs[0]
@@ -621,21 +620,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
            output_attentions = inputs.get("output_attentions", output_attentions)
            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
            assert len(inputs) <= 10, "Too many inputs."
            if "past_key_values" in inputs:
                warnings.warn(
                    "The `past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                    FutureWarning,
                )
                past_key_values = inputs.pop("past_key_values")
        else:
            input_ids = inputs
            if "past_key_values" in kwargs:
                warnings.warn(
                    "The `past_key_values` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                    FutureWarning,
                )
                past_key_values = kwargs.pop("past_key_values")
        output_attentions = output_attentions if output_attentions is not None else self.output_attentions
        output_hidden_states = output_hidden_states if output_hidden_states is not None else self.output_hidden_states
@@ -1078,23 +1064,9 @@ class TFT5Model(TFT5PreTrainedModel):
            output_attentions = inputs.get("output_attentions", output_attentions)
            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
            assert len(inputs) <= 13, "Too many inputs."
            if "past_key_value_states" in inputs:
                warnings.warn(
                    "The `past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                    FutureWarning,
                )
                past_key_values = inputs.pop("past_key_value_states")
        else:
            input_ids = inputs
            if "past_key_value_states" in kwargs:
                warnings.warn(
                    "The `past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                    FutureWarning,
                )
                past_key_values = kwargs.pop("past_key_value_states")
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions else self.config.output_attentions
        output_hidden_states = output_hidden_states if output_hidden_states else self.config.output_hidden_states
@@ -1294,23 +1266,9 @@ class TFT5ForConditionalGeneration(TFT5PreTrainedModel, TFCausalLanguageModeling
            output_hidden_states = inputs.get("output_hidden_states", output_hidden_states)
            return_dict = inputs.get("return_dict", return_dict)
            assert len(inputs) <= 14, "Too many inputs."
            if "past_key_value_states" in inputs:
                warnings.warn(
                    "The `past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                    FutureWarning,
                )
                past_key_values = inputs.pop("past_key_value_states")
        else:
            input_ids = inputs
            if "past_key_value_states" in kwargs:
                warnings.warn(
                    "The `past_key_value_states` argument is deprecated and will be removed in a future version, use `past_key_values` instead.",
                    FutureWarning,
                )
                past_key_values = kwargs.pop("past_key_value_states")
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions else self.config.output_attentions
        output_hidden_states = output_hidden_states if output_hidden_states else self.config.output_hidden_states
--- a/src/transformers/models/transfo_xl/configuration_transfo_xl.py
+++ b/src/transformers/models/transfo_xl/configuration_transfo_xl.py
@@ -15,9 +15,6 @@
 # limitations under the License.
 """ Transformer XL configuration """
 import warnings
 from ...configuration_utils import PretrainedConfig
 from ...utils import logging
@@ -139,13 +136,6 @@ class TransfoXLConfig(PretrainedConfig):
        eos_token_id=0,
        **kwargs
    ):
        if "tie_weight" in kwargs:
            warnings.warn(
                "The config parameter `tie_weight` is deprecated. Please use `tie_word_embeddings` instead.",
                FutureWarning,
            )
            kwargs["tie_word_embeddings"] = kwargs["tie_weight"]
        super().__init__(eos_token_id=eos_token_id, **kwargs)
        self.vocab_size = vocab_size
        self.cutoffs = []
--- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py
+++ b/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py
@@ -16,7 +16,6 @@
 """
 TF 2.0 Transformer XL model.
 """
 import warnings
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
@@ -865,13 +864,6 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
            return self.crit.out_layers[-1]
        return None
    def reset_length(self, tgt_len, ext_len, mem_len):
        warnings.warn(
            "The method `reset_length` is deprecated and will be removed in a future version, use `reset_memory_length` instead.",
            FutureWarning,
        )
        self.transformer.reset_memory_length(mem_len)
    def reset_memory_length(self, mem_len):
        self.transformer.reset_memory_length(mem_len)
--- a/src/transformers/models/transfo_xl/modeling_transfo_xl.py
+++ b/src/transformers/models/transfo_xl/modeling_transfo_xl.py
@@ -17,7 +17,6 @@
 PyTorch Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. In particular
 https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py
 """
 import warnings
 from dataclasses import dataclass
 from typing import List, Optional, Tuple
@@ -1010,13 +1009,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
                    else:
                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
    def reset_length(self, tgt_len, ext_len, mem_len):
        warnings.warn(
            "The method `reset_length` is deprecated and will be removed in a future version, use `reset_memory_length` instead.",
            FutureWarning,
        )
        self.transformer.reset_memory_length(mem_len)
    def reset_memory_length(self, mem_len):
        self.transformer.reset_memory_length(mem_len)
--- a/src/transformers/models/xlm/modeling_tf_xlm.py
+++ b/src/transformers/models/xlm/modeling_tf_xlm.py
@@ -16,9 +16,7 @@
 TF 2.0 XLM model.
 """
 import itertools
 import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple
@@ -997,10 +995,9 @@ class TFXLMForMultipleChoice(TFXLMPreTrainedModel, TFMultipleChoiceLoss):
        )
        if lengths is not None:
-            warnings.warn(
+            logger.warn(
                "The `lengths` parameter cannot be used with the XLM multiple choice models. Please use the "
                "attention mask instead.",
-                FutureWarning,
            )
            lengths = None
--- a/src/transformers/models/xlm/modeling_xlm.py
+++ b/src/transformers/models/xlm/modeling_xlm.py
@@ -16,10 +16,8 @@
 PyTorch XLM model.
 """
 import itertools
 import math
 import warnings
 from dataclasses import dataclass
 from typing import Optional, Tuple
@@ -1228,10 +1226,9 @@ class XLMForMultipleChoice(XLMPreTrainedModel):
        )
        if lengths is not None:
-            warnings.warn(
+            logger.warn(
                "The `lengths` parameter cannot be used with the XLM multiple choice models. Please use the "
-                "attention mask instead.",
+                "attention mask instead."
-                FutureWarning,
            )
            lengths = None
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -1182,7 +1182,6 @@ class FillMaskPipeline(Pipeline):
        device: int = -1,
        top_k=5,
        task: str = "",
        **kwargs
    ):
        super().__init__(
            model=model,
@@ -1196,15 +1195,7 @@ class FillMaskPipeline(Pipeline):
        )
        self.check_model_type(TF_MODEL_WITH_LM_HEAD_MAPPING if self.framework == "tf" else MODEL_FOR_MASKED_LM_MAPPING)
-
+        self.top_k = top_k
-        if "topk" in kwargs:
-            warnings.warn(
-                "The `topk` argument is deprecated and will be removed in a future version, use `top_k` instead.",
-                FutureWarning,
-            )
-            self.top_k = kwargs.pop("topk")
-        else:
-            self.top_k = top_k
    def ensure_exactly_one_mask_token(self, masked_index: np.ndarray):
        numel = np.prod(masked_index.shape)
--- a/src/transformers/tokenization_utils.py
+++ b/src/transformers/tokenization_utils.py
@@ -19,7 +19,6 @@
 import itertools
 import re
 import unicodedata
 import warnings
 from typing import Any, Dict, List, Optional, Tuple, Union, overload
 from .file_utils import add_end_docstrings
@@ -246,12 +245,6 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
        Returns:
            :obj:`List[str]`: The list of tokens.
        """
        if "is_pretokenized" in kwargs:
            warnings.warn(
                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
                FutureWarning,
            )
            kwargs["is_split_into_words"] = kwargs.pop("is_pretokenized")
        # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
        all_special_tokens_extended = dict(
            (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
@@ -448,13 +441,6 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                "https://github.com/huggingface/transformers/pull/2674"
            )
        if "is_pretokenized" in kwargs:
            warnings.warn(
                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
                FutureWarning,
            )
            is_split_into_words = kwargs.pop("is_pretokenized")
        first_ids = get_input_ids(text)
        second_ids = get_input_ids(text_pair) if text_pair is not None else None
@@ -530,13 +516,6 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                "transformers.PreTrainedTokenizerFast."
            )
        if "is_pretokenized" in kwargs:
            warnings.warn(
                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
                FutureWarning,
            )
            is_split_into_words = kwargs.pop("is_pretokenized")
        input_ids = []
        for ids_or_pair_ids in batch_text_or_text_pairs:
            if not isinstance(ids_or_pair_ids, (list, tuple)):
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1532,18 +1532,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
        super().__init__(**kwargs)
    @property
    def max_len(self) -> int:
        """
        :obj:`int`: **Deprecated** Kept here for backward compatibility. Now renamed to :obj:`model_max_length` to
        avoid ambiguity.
        """
        warnings.warn(
            "The `max_len` attribute has been deprecated and will be removed in a future version, use `model_max_length` instead.",
            FutureWarning,
        )
        return self.model_max_length
    @property
    def max_len_single_sentence(self) -> int:
        """
@@ -2785,15 +2773,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
                and ``convert_tokens_to_ids`` methods.
        """
        if "return_lengths" in kwargs:
            if verbose:
                warnings.warn(
                    "The PreTrainedTokenizerBase.prepare_for_model `return_lengths` parameter is deprecated. "
                    "Please use `return_length` instead.",
                    FutureWarning,
                )
            return_length = kwargs["return_lengths"]
        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
            padding=padding,
--- a/src/transformers/tokenization_utils_fast.py
+++ b/src/transformers/tokenization_utils_fast.py
@@ -19,7 +19,6 @@
 import json
 import os
 import warnings
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Tuple, Union
@@ -357,7 +356,6 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        if not isinstance(batch_text_or_text_pairs, list):
@@ -365,16 +363,6 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
                "batch_text_or_text_pairs has to be a list (got {})".format(type(batch_text_or_text_pairs))
            )
        if "is_pretokenized" in kwargs:
            warnings.warn(
                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
                FutureWarning,
            )
            is_split_into_words = kwargs.pop("is_pretokenized")
        if kwargs:
            raise ValueError(f"Keyword arguments {kwargs} not recognized.")
        # Set the truncation and padding strategy and restore the initial configuration
        self.set_truncation_and_padding(
            padding_strategy=padding_strategy,
@@ -453,12 +441,6 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
        verbose: bool = True,
        **kwargs
    ) -> BatchEncoding:
        if "is_pretokenized" in kwargs:
            warnings.warn(
                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
                FutureWarning,
            )
            is_split_into_words = kwargs.pop("is_pretokenized")
        batched_input = [(text, text_pair)] if text_pair else [text]
        batched_output = self._batch_encode_plus(
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -213,8 +213,6 @@ class Trainer:
            containing the optimizer and the scheduler to use. Will default to an instance of
            :class:`~transformers.AdamW` on your model and a scheduler given by
            :func:`~transformers.get_linear_schedule_with_warmup` controlled by :obj:`args`.
        kwargs:
            Deprecated keyword arguments.
    """
    def __init__(
@@ -229,7 +227,6 @@ class Trainer:
        compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None,
        callbacks: Optional[List[TrainerCallback]] = None,
        optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None),
        **kwargs,
    ):
        if args is None:
            logger.info("No `TrainingArguments` passed, using the current path as `output_dir`.")
@@ -262,27 +259,6 @@ class Trainer:
        self.callback_handler = CallbackHandler(callbacks, self.model, self.optimizer, self.lr_scheduler)
        self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK)
        # Deprecated arguments
        if "tb_writer" in kwargs:
            warnings.warn(
                "Passing `tb_writer` as a keyword argument is deprecated and won't be possible in a "
                + "future version. Use `TensorBoardCallback(tb_writer=...)` instead and pass it to the `callbacks`"
                + "argument",
                FutureWarning,
            )
            tb_writer = kwargs.pop("tb_writer")
            self.remove_callback(TensorBoardCallback)
            self.add_callback(TensorBoardCallback(tb_writer=tb_writer))
        if "prediction_loss_only" in kwargs:
            warnings.warn(
                "Passing `prediction_loss_only` as a keyword argument is deprecated and won't be possible in a "
                + "future version. Use `args.prediction_loss_only` instead. Setting "
                + f"`args.prediction_loss_only={kwargs['prediction_loss_only']}",
                FutureWarning,
            )
            self.args.prediction_loss_only = kwargs.pop("prediction_loss_only")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        # Will be set to True by `self._setup_loggers()` on first call to `self.log()`.
        self._loggers_initialized = False
@@ -294,14 +270,7 @@ class Trainer:
            # We'll find a more elegant and not need to do this in the future.
            self.model.config.xla_device = True
        if not callable(self.data_collator) and callable(getattr(self.data_collator, "collate_batch", None)):
-            self.data_collator = self.data_collator.collate_batch
+            raise ValueError("The `data_collator` should be a simple callable (function, class with `__call__`).")
-            warnings.warn(
-                (
-                    "The `data_collator` should now be a simple callable (function, class with `__call__`), classes "
-                    + "with a `collate_batch` are deprecated and won't be supported in a future version."
-                ),
-                FutureWarning,
-            )
        if args.max_steps > 0:
            logger.info("max_steps is given, it will override any value given in num_train_epochs")
@@ -1050,12 +1019,6 @@ class Trainer:
            logs (:obj:`Dict[str, float]`):
                The values to log.
        """
        if hasattr(self, "_log"):
            warnings.warn(
                "The `_log` method is deprecated and won't be called in a future version, define `log` in your subclass.",
                FutureWarning,
            )
            return self._log(logs)
        if self.state.epoch is not None:
            logs["epoch"] = self.state.epoch
@@ -1095,12 +1058,6 @@ class Trainer:
        Return:
            :obj:`torch.Tensor`: The tensor with training loss on this batch.
        """
        if hasattr(self, "_training_step"):
            warnings.warn(
                "The `_training_step` method is deprecated and won't be called in a future version, define `training_step` in your subclass.",
                FutureWarning,
            )
            return self._training_step(model, inputs, self.optimizer)
        model.train()
        inputs = self._prepare_inputs(inputs)
@@ -1140,18 +1097,6 @@ class Trainer:
        # We don't use .loss here since the model may return tuples instead of ModelOutput.
        return outputs[0]
    def is_local_master(self) -> bool:
        """
        Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on several
        machines) main process.
        .. warning::
            This method is deprecated, use :meth:`~transformers.Trainer.is_local_process_zero` instead.
        """
        warnings.warn("This method is deprecated, use `Trainer.is_local_process_zero()` instead.", FutureWarning)
        return self.is_local_process_zero()
    def is_local_process_zero(self) -> bool:
        """
        Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on several
@@ -1162,18 +1107,6 @@ class Trainer:
        else:
            return self.args.local_rank in [-1, 0]
    def is_world_master(self) -> bool:
        """
        Whether or not this process is the global main process (when training in a distributed fashion on several
        machines, this is only going to be :obj:`True` for one process).
        .. warning::
            This method is deprecated, use :meth:`~transformers.Trainer.is_world_process_zero` instead.
        """
        warnings.warn("This method is deprecated, use `Trainer.is_world_process_zero()` instead.", FutureWarning)
        return self.is_world_process_zero()
    def is_world_process_zero(self) -> bool:
        """
        Whether or not this process is the global main process (when training in a distributed fashion on several
@@ -1362,13 +1295,6 @@ class Trainer:
        Works both with or without labels.
        """
        if hasattr(self, "_prediction_loop"):
            warnings.warn(
                "The `_prediction_loop` method is deprecated and won't be called in a future version, define `prediction_loop` in your subclass.",
                FutureWarning,
            )
            return self._prediction_loop(dataloader, description, prediction_loss_only=prediction_loss_only)
        if not isinstance(dataloader.dataset, collections.abc.Sized):
            raise ValueError("dataset must implement __len__")
        prediction_loss_only = (
--- a/src/transformers/trainer_tf.py
+++ b/src/transformers/trainer_tf.py
@@ -3,7 +3,6 @@
 import datetime
 import math
 import os
 import warnings
 from typing import Callable, Dict, Optional, Tuple
@@ -66,8 +65,6 @@ class TFTrainer:
            :class:`~transformers.AdamWeightDecay`. The scheduler will default to an instance of
            :class:`tf.keras.optimizers.schedules.PolynomialDecay` if :obj:`args.num_warmup_steps` is 0 else an
            instance of :class:`~transformers.WarmUp`.
        kwargs:
            Deprecated keyword arguments.
    """
    def __init__(
@@ -82,7 +79,6 @@ class TFTrainer:
            None,
            None,
        ),
        **kwargs,
    ):
        assert parse(tf.__version__).release >= (2, 2, 0), (
            "You need to run the TensorFlow trainer with at least the version 2.2.0, your version is %r "
@@ -98,13 +94,6 @@ class TFTrainer:
        self.gradient_accumulator = GradientAccumulator()
        self.global_step = 0
        self.epoch_logging = 0
        if "prediction_loss_only" in kwargs:
            warnings.warn(
                "Passing `prediction_loss_only` as a keyword argument is deprecated and won't be possible in a future version. Use `args.prediction_loss_only` instead.",
                FutureWarning,
            )
            self.args.prediction_loss_only = kwargs.pop("prediction_loss_only")
        assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
        if tb_writer is not None:
            self.tb_writer = tb_writer
@@ -249,12 +238,6 @@ class TFTrainer:
            WANDB_DISABLED:
                (Optional): boolean - defaults to false, set to "true" to disable wandb entirely.
        """
        if hasattr(self, "_setup_wandb"):
            warnings.warn(
                "The `_setup_wandb` method is deprecated and won't be called in a future version, define `setup_wandb` in your subclass.",
                FutureWarning,
            )
            return self._setup_wandb()
        logger.info('Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"')
        combined_dict = {**self.model.config.to_dict(), **self.args.to_sanitized_dict()}
@@ -304,14 +287,6 @@ class TFTrainer:
        Works both with or without labels.
        """
        if hasattr(self, "_prediction_loop"):
            warnings.warn(
                "The `_prediction_loop` method is deprecated and won't be called in a future version, define `prediction_loop` in your subclass.",
                FutureWarning,
            )
            return self._prediction_loop(
                dataset, steps, num_examples, description, prediction_loss_only=prediction_loss_only
            )
        prediction_loss_only = (
            prediction_loss_only if prediction_loss_only is not None else self.args.prediction_loss_only
@@ -393,12 +368,6 @@ class TFTrainer:
            logs (:obj:`Dict[str, float]`):
                The values to log.
        """
        if hasattr(self, "_log"):
            warnings.warn(
                "The `_log` method is deprecated and won't be called in a future version, define `log` in your subclass.",
                FutureWarning,
            )
            return self._log(logs)
        logs["epoch"] = self.epoch_logging
        if self.tb_writer:
@@ -733,12 +702,6 @@ class TFTrainer:
        Returns:
            A tuple of two :obj:`tf.Tensor`: The loss and logits.
        """
        if hasattr(self, "_run_model"):
            warnings.warn(
                "The `_run_model` method is deprecated and won't be called in a future version, define `run_model` in your subclass.",
                FutureWarning,
            )
            return self._run_model(features, labels, training)
        if self.args.past_index >= 0 and getattr(self, "_past", None) is not None:
            features["mems"] = self._past
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -1,7 +1,6 @@
 import dataclasses
 import json
 import os
 import warnings
 from dataclasses import dataclass, field
 from enum import Enum
 from typing import Any, Dict, List, Optional, Tuple
@@ -198,10 +197,6 @@ class TrainingArguments:
    do_train: bool = field(default=False, metadata={"help": "Whether to run training."})
    do_eval: bool = field(default=None, metadata={"help": "Whether to run eval on the dev set."})
    do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."})
    evaluate_during_training: bool = field(
        default=False,
        metadata={"help": "Run evaluation during training at each logging step."},
    )
    evaluation_strategy: EvaluationStrategy = field(
        default="no",
        metadata={"help": "Run evaluation during training at each logging step."},
@@ -340,12 +335,6 @@ class TrainingArguments:
    def __post_init__(self):
        if self.disable_tqdm is None:
            self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN
        if self.evaluate_during_training is True:
            self.evaluation_strategy = EvaluationStrategy.STEPS
            warnings.warn(
                "The `evaluate_during_training` argument is deprecated in favor of `evaluation_strategy` (which has more options)",
                FutureWarning,
            )
        self.evaluation_strategy = EvaluationStrategy(self.evaluation_strategy)
        if self.do_eval is False and self.evaluation_strategy != EvaluationStrategy.NO:
            self.do_eval = True
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
@@ -73,7 +73,6 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(BertTokenizerFast):
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
 {%- elif cookiecutter.tokenizer_type == "Standalone" %}
 import warnings
 from typing import List, Optional
 from tokenizers import ByteLevelBPETokenizer
@@ -234,13 +233,6 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(PreTrainedTokenizer):
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        if "is_pretokenized" in kwargs:
            warnings.warn(
                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
                FutureWarning,
            )
            is_split_into_words = kwargs.pop("is_pretokenized")
        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
            text = " " + text
@@ -285,29 +277,6 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast)
        )
        self.add_prefix_space = add_prefix_space
    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        """Delegate batch encoding to the parent class, honoring a deprecated keyword.

        Args:
            *args: Positional arguments passed through unchanged to the parent
                ``_batch_encode_plus``.
            **kwargs: Keyword arguments passed through to the parent. The
                deprecated ``is_pretokenized`` keyword is accepted and renamed
                to ``is_split_into_words`` before delegation.

        Returns:
            Whatever the parent ``_batch_encode_plus`` returns.

        Warns:
            FutureWarning: If ``is_pretokenized`` is supplied.
        """
        if "is_pretokenized" in kwargs:
            warnings.warn(
                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
                FutureWarning,
            )
            # Forward the deprecated value under its new name. Previously it was
            # popped into an unused local, so the caller's flag was silently lost.
            # The deprecated spelling wins if both keywords are given, matching
            # the precedence of the earlier implementation.
            kwargs["is_split_into_words"] = kwargs.pop("is_pretokenized")
        return super()._batch_encode_plus(*args, **kwargs)
    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        """Delegate single-input encoding to the parent class, honoring a deprecated keyword.

        Args:
            *args: Positional arguments passed through unchanged to the parent
                ``_encode_plus``.
            **kwargs: Keyword arguments passed through to the parent. The
                deprecated ``is_pretokenized`` keyword is accepted and renamed
                to ``is_split_into_words`` before delegation.

        Returns:
            Whatever the parent ``_encode_plus`` returns.

        Warns:
            FutureWarning: If ``is_pretokenized`` is supplied.
        """
        if "is_pretokenized" in kwargs:
            warnings.warn(
                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
                FutureWarning,
            )
            # The deprecated key must be removed from kwargs: earlier it was left
            # in place and leaked into the parent call. Its value is forwarded
            # under the new name so the caller's intent is preserved.
            kwargs["is_split_into_words"] = kwargs.pop("is_pretokenized")
        return super()._encode_plus(*args, **kwargs)
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
        if token_ids_1 is None:
--- a/tests/test_modeling_gpt2.py
+++ b/tests/test_modeling_gpt2.py
@@ -213,7 +213,9 @@ class GPT2ModelTester:
        next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
        output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
-        output_from_past = model(next_tokens, token_type_ids=next_token_types, past=past)["last_hidden_state"]
+        output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[
            "last_hidden_state"
        ]
        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
@@ -255,7 +257,7 @@ class GPT2ModelTester:
        # get two different outputs
        output_from_no_past = model(next_input_ids, attention_mask=attn_mask)["last_hidden_state"]
-        output_from_past = model(next_tokens, past=past, attention_mask=attn_mask)["last_hidden_state"]
+        output_from_past = model(next_tokens, past_key_values=past, attention_mask=attn_mask)["last_hidden_state"]
        # select random slice
        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
@@ -286,7 +288,9 @@ class GPT2ModelTester:
        next_token_type_ids = torch.cat([token_type_ids, next_token_types], dim=-1)
        output_from_no_past = model(next_input_ids, token_type_ids=next_token_type_ids)["last_hidden_state"]
-        output_from_past = model(next_tokens, token_type_ids=next_token_types, past=past)["last_hidden_state"]
+        output_from_past = model(next_tokens, token_type_ids=next_token_types, past_key_values=past)[
            "last_hidden_state"
        ]
        self.parent.assertTrue(output_from_past.shape[1] == next_tokens.shape[1])
        # select random slice
--- a/tests/test_pipelines_fill_mask.py
+++ b/tests/test_pipelines_fill_mask.py
@@ -1,7 +1,5 @@
 import unittest
 import pytest
 from transformers import pipeline
 from transformers.testing_utils import require_tf, require_torch, slow
@@ -53,13 +51,6 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
    ]
    expected_check_keys = ["sequence"]
    @require_torch
    def test_torch_topk_deprecation(self):
        """Passing the legacy ``topk`` keyword when building the pipeline must warn.

        The keyword is supplied to the ``pipeline(...)`` factory call itself, and
        the test expects a FutureWarning whose message mentions ``top_k``.
        """
        expected_message = r".*use `top_k`.*"
        model_name = self.small_models[0]
        with pytest.warns(FutureWarning, match=expected_message):
            pipeline(task="fill-mask", model=model_name, topk=1)
    @require_torch
    def test_torch_fill_mask(self):
        valid_inputs = "My name is <mask>"
--- a/tests/test_tokenization_auto.py
+++ b/tests/test_tokenization_auto.py
@@ -83,7 +83,7 @@ class AutoTokenizerTest(unittest.TestCase):
            else:
                self.assertEqual(tokenizer.do_lower_case, False)
-            self.assertEqual(tokenizer.max_len, 512)
+            self.assertEqual(tokenizer.model_max_length, 512)
    @require_tokenizers
    def test_tokenizer_identifier_non_existent(self):