[PyTorch Bart] Split Bart into different models (#9343)

* first try * remove old template * finish bart * finish mbart * delete unnecessary line * init pegasus * save intermediate * correct pegasus * finish pegasus * remove cookie cutter leftover * add marian * finish blenderbot * replace in file * correctly split blenderbot * delete "old" folder * correct "add statement" * adapt config for tf comp * correct configs for tf * remove ipdb * fix more stuff * fix mbart * push pegasus fix * fix mbart * more fixes * fix research projects code * finish docs for bart, mbart, and marian * delete unnecessary file * correct attn typo * correct configs * remove pegasus for seq class * correct peg docs * correct peg docs * finish configs * further improve docs * add copied from statements to mbart * fix copied from in mbart * add copy statements to marian * add copied from to marian * add pegasus copied from * finish pegasus * finish copied from * Apply suggestions from code review * make style * backward comp blenderbot * apply lysandres and sylvains suggestions * apply suggestions * push last fixes * fix docs * fix tok tests * fix imports code style * fix doc
2021-01-05 22:00:05 +01:00
parent 4eec5d0cf6
commit eef66035a2
59 changed files with 9273 additions and 2271 deletions
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -120,11 +120,11 @@ from .models.bert import (
 from .models.bert_generation import BertGenerationConfig
 from .models.bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer
 from .models.bertweet import BertweetTokenizer
-from .models.blenderbot import (
-    BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-    BlenderbotConfig,
+from .models.blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig, BlenderbotTokenizer
+from .models.blenderbot_small import (
+    BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    BlenderbotSmallConfig,
    BlenderbotSmallTokenizer,
-    BlenderbotTokenizer,
 )
 from .models.camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig
 from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer
@@ -415,6 +415,11 @@ if is_torch_available():
        BlenderbotForConditionalGeneration,
        BlenderbotModel,
    )
+    from .models.blenderbot_small import (
+        BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST,
+        BlenderbotSmallForConditionalGeneration,
+        BlenderbotSmallModel,
+    )
    from .models.camembert import (
        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
        CamembertForCausalLM,
@@ -536,8 +541,13 @@ if is_torch_available():
        LxmertVisualFeatureEncoder,
        LxmertXLayer,
    )
-    from .models.marian import MarianMTModel
-    from .models.mbart import MBartForConditionalGeneration, MBartModel
+    from .models.marian import MarianModel, MarianMTModel
+    from .models.mbart import (
+        MBartForConditionalGeneration,
+        MBartForQuestionAnswering,
+        MBartForSequenceClassification,
+        MBartModel,
+    )
    from .models.mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings
    from .models.mobilebert import (
        MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -23,6 +23,10 @@ from ..bart.configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartCo
 from ..bert.configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, BertConfig
 from ..bert_generation.configuration_bert_generation import BertGenerationConfig
 from ..blenderbot.configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig
+from ..blenderbot_small.configuration_blenderbot_small import (
+    BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+    BlenderbotSmallConfig,
+)
 from ..camembert.configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig
 from ..ctrl.configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig
 from ..deberta.configuration_deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig
@@ -68,6 +72,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
    for pretrained_map in [
        # Add archive maps here
        LED_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        BART_PRETRAINED_CONFIG_ARCHIVE_MAP,
        BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -108,6 +113,7 @@ CONFIG_MAPPING = OrderedDict(
    [
        # Add configs here
        ("led", LEDConfig),
+        ("blenderbot-small", BlenderbotSmallConfig),
        ("retribert", RetriBertConfig),
        ("mt5", MT5Config),
        ("t5", T5Config),
@@ -154,6 +160,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
    [
        # Add full (and cased) model names here
        ("led", "LED"),
+        ("blenderbot-small", "BlenderbotSmall"),
        ("retribert", "RetriBERT"),
        ("t5", "T5"),
        ("mobilebert", "MobileBERT"),
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -51,6 +51,7 @@ from ..bert.modeling_bert import (
 )
 from ..bert_generation.modeling_bert_generation import BertGenerationDecoder, BertGenerationEncoder
 from ..blenderbot.modeling_blenderbot import BlenderbotForConditionalGeneration, BlenderbotModel
+from ..blenderbot_small.modeling_blenderbot_small import BlenderbotSmallForConditionalGeneration, BlenderbotSmallModel
 from ..camembert.modeling_camembert import (
    CamembertForCausalLM,
    CamembertForMaskedLM,
@@ -116,8 +117,13 @@ from ..longformer.modeling_longformer import (
    LongformerModel,
 )
 from ..lxmert.modeling_lxmert import LxmertForPreTraining, LxmertForQuestionAnswering, LxmertModel
-from ..marian.modeling_marian import MarianMTModel
-from ..mbart.modeling_mbart import MBartForConditionalGeneration, MBartModel
+from ..marian.modeling_marian import MarianModel, MarianMTModel
+from ..mbart.modeling_mbart import (
+    MBartForConditionalGeneration,
+    MBartForQuestionAnswering,
+    MBartForSequenceClassification,
+    MBartModel,
+)
 from ..mobilebert.modeling_mobilebert import (
    MobileBertForMaskedLM,
    MobileBertForMultipleChoice,
@@ -215,6 +221,7 @@ from .configuration_auto import (
    BertConfig,
    BertGenerationConfig,
    BlenderbotConfig,
+    BlenderbotSmallConfig,
    CamembertConfig,
    CTRLConfig,
    DebertaConfig,
@@ -260,6 +267,7 @@ MODEL_MAPPING = OrderedDict(
    [
        # Base model mapping
        (LEDConfig, LEDModel),
+        (BlenderbotSmallConfig, BlenderbotSmallModel),
        (RetriBertConfig, RetriBertModel),
        (MT5Config, MT5Model),
        (T5Config, T5Model),
@@ -297,6 +305,7 @@ MODEL_MAPPING = OrderedDict(
        (ProphetNetConfig, ProphetNetModel),
        (MPNetConfig, MPNetModel),
        (TapasConfig, TapasModel),
+        (MarianConfig, MarianModel),
    ]
 )

@@ -336,6 +345,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
    [
        # Model with LM heads mapping
        (LEDConfig, LEDForConditionalGeneration),
+        (BlenderbotSmallConfig, BlenderbotSmallForConditionalGeneration),
        (LayoutLMConfig, LayoutLMForMaskedLM),
        (T5Config, T5ForConditionalGeneration),
        (DistilBertConfig, DistilBertForMaskedLM),
@@ -417,6 +427,7 @@ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
    [
        # Model for Seq2Seq Causal LM mapping
        (LEDConfig, LEDForConditionalGeneration),
+        (BlenderbotSmallConfig, BlenderbotSmallForConditionalGeneration),
        (MT5Config, MT5ForConditionalGeneration),
        (T5Config, T5ForConditionalGeneration),
        (PegasusConfig, PegasusForConditionalGeneration),
@@ -439,6 +450,7 @@ MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = OrderedDict(
        (AlbertConfig, AlbertForSequenceClassification),
        (CamembertConfig, CamembertForSequenceClassification),
        (XLMRobertaConfig, XLMRobertaForSequenceClassification),
+        (MBartConfig, MBartForSequenceClassification),
        (BartConfig, BartForSequenceClassification),
        (LongformerConfig, LongformerForSequenceClassification),
        (RobertaConfig, RobertaForSequenceClassification),
@@ -469,6 +481,7 @@ MODEL_FOR_QUESTION_ANSWERING_MAPPING = OrderedDict(
        (AlbertConfig, AlbertForQuestionAnswering),
        (CamembertConfig, CamembertForQuestionAnswering),
        (BartConfig, BartForQuestionAnswering),
+        (MBartConfig, MBartForQuestionAnswering),
        (LongformerConfig, LongformerForQuestionAnswering),
        (XLMRobertaConfig, XLMRobertaForQuestionAnswering),
        (RobertaConfig, RobertaForQuestionAnswering),
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -24,7 +24,7 @@ from ..bart.tokenization_bart import BartTokenizer
 from ..bert.tokenization_bert import BertTokenizer
 from ..bert_japanese.tokenization_bert_japanese import BertJapaneseTokenizer
 from ..bertweet.tokenization_bertweet import BertweetTokenizer
-from ..blenderbot.tokenization_blenderbot import BlenderbotSmallTokenizer
+from ..blenderbot_small.tokenization_blenderbot_small import BlenderbotSmallTokenizer
 from ..ctrl.tokenization_ctrl import CTRLTokenizer
 from ..deberta.tokenization_deberta import DebertaTokenizer
 from ..distilbert.tokenization_distilbert import DistilBertTokenizer
@@ -197,12 +197,12 @@ TOKENIZER_MAPPING = OrderedDict(
        (AlbertConfig, (AlbertTokenizer, AlbertTokenizerFast)),
        (CamembertConfig, (CamembertTokenizer, CamembertTokenizerFast)),
        (PegasusConfig, (PegasusTokenizer, PegasusTokenizerFast)),
+        (MBartConfig, (BarthezTokenizer, BarthezTokenizerFast)),
        (MBartConfig, (MBartTokenizer, MBartTokenizerFast)),
        (XLMRobertaConfig, (XLMRobertaTokenizer, XLMRobertaTokenizerFast)),
        (MarianConfig, (MarianTokenizer, None)),
        (BlenderbotConfig, (BlenderbotSmallTokenizer, None)),
        (LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)),
-        (BartConfig, (BarthezTokenizer, BarthezTokenizerFast)),
        (BartConfig, (BartTokenizer, BartTokenizerFast)),
        (LongformerConfig, (LongformerTokenizer, LongformerTokenizerFast)),
        (RobertaConfig, (RobertaTokenizer, RobertaTokenizerFast)),
--- a/src/transformers/models/bart/__init__.py
+++ b/src/transformers/models/bart/__init__.py
@@ -15,9 +15,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from ...file_utils import is_tf_available, is_tokenizers_available, is_torch_available
-from .configuration_bart import BartConfig
+from .configuration_bart import BART_PRETRAINED_CONFIG_ARCHIVE_MAP, BartConfig
 from .tokenization_bart import BartTokenizer


--- a/src/transformers/models/bart/configuration_bart.py
+++ b/src/transformers/models/bart/configuration_bart.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team.
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" BART configuration """
+""" BART model configuration """

 from ...configuration_utils import PretrainedConfig
 from ...utils import logging
@@ -21,34 +21,33 @@ from ...utils import logging
 logger = logging.get_logger(__name__)

 BART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/config.json",
    "facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/config.json",
-    "facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/config.json",
-    "facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/config.json",
-    "facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/config.json",
-    "facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/config.json",
-    "yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/config.json",
+    # See all BART models at https://huggingface.co/models?filter=bart
 }


 class BartConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.BartModel`. It is used to
-    instantiate a BART model according to the specified arguments, defining the model architecture.
+    instantiate a BART model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the BART `facebook/bart-large
+    <https://huggingface.co/facebook/bart-large>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

+
    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 50265):
-            Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.BartModel`.
+            Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the
+            :obj:`input_ids` passed when calling :class:`~transformers.BartModel` or
+            :class:`~transformers.TFBartModel`.
        d_model (:obj:`int`, `optional`, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (:obj:`int`, `optional`, defaults to 12):
-            Number of encoder layers, 6 are used for the `bart-base` model.
+            Number of encoder layers.
        decoder_layers (:obj:`int`, `optional`, defaults to 12):
-            Number of decoder layers, 6 are used for the `bart-base` model.
+            Number of decoder layers.
        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
@@ -73,145 +72,113 @@ class BartConfig(PretrainedConfig):
            just in case (e.g., 512 or 1024 or 2048).
        init_std (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            This should be completed, specific to marian.
-        normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Call layernorm before attention ops.
-        normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Call layernorm after embeddings.
-        static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Don't learn positional embeddings, use sinusoidal.
-        add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Why not add another layernorm?
-        do_blenderbot_90_layernorm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Blenderbot-90m checkpoint uses `layernorm_embedding` one line earlier in the decoder.
-        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Scale embeddings by diving by sqrt(d_model).
-        eos_token_id (:obj:`int`, `optional`, defaults to 2)
-            End of stream token id.
-        pad_token_id (:obj:`int`, `optional`, defaults to 1)
-            Padding token id.
-        bos_token_id (:obj:`int`, `optional`, defaults to 0)
-            Beginning of stream token id.
+        force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), only
+            :obj:`True` for `bart-large-cnn`.
        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
            https://arxiv.org/abs/1909.11556>`__ for more details.
        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
            https://arxiv.org/abs/1909.11556>`__ for more details.
-        extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
-            How many extra learned positional embeddings to use. Should be set to :obj:`pad_token_id+1`.
-        num_labels: (:obj:`int`, `optional`, defaults to 3):
-            The number of labels to use in :class:`~transformers.BartForSequenceClassification`.
-        is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether this is an encoder/decoder model.
-        force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``), only
-            :obj:`True` for `bart-large-cnn`.
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Scale embeddings by multiplying by sqrt(d_model).
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
+        num_labels (:obj:`int`, `optional`, defaults to 3):
+            The number of labels to use in :class:`~transformers.BartForSequenceClassification`.
+
+    Example::
+
+        >>> from transformers import BartModel, BartConfig
+
+        >>> # Initializing a BART facebook/bart-large style configuration
+        >>> configuration = BartConfig()
+
+        >>> # Initializing a model from the facebook/bart-large style configuration
+        >>> model = BartModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
    """
    model_type = "bart"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
-        activation_dropout=0.0,
-        extra_pos_embeddings=2,
-        activation_function="gelu",
        vocab_size=50265,
-        d_model=1024,
-        encoder_ffn_dim=4096,
+        max_position_embeddings=1024,
        encoder_layers=12,
+        encoder_ffn_dim=4096,
        encoder_attention_heads=16,
-        decoder_ffn_dim=4096,
        decoder_layers=12,
+        decoder_ffn_dim=4096,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
-        attention_dropout=0.0,
+        activation_function="gelu",
+        d_model=1024,
        dropout=0.1,
-        max_position_embeddings=1024,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
-        num_labels=3,
-        is_encoder_decoder=True,
-        normalize_before=False,
-        add_final_layer_norm=False,
-        do_blenderbot_90_layernorm=False,
        scale_embedding=False,
-        normalize_embedding=True,
-        static_position_embeddings=False,
-        add_bias_logits=False,
+        gradient_checkpointing=False,
        force_bos_token_to_be_generated=False,
        use_cache=True,
+        num_labels=3,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
-        **common_kwargs
+        is_encoder_decoder=True,
+        decoder_start_token_id=2,
+        **kwargs
    ):
-        r"""
-        :class:`~transformers.BartConfig` is the configuration class for `BartModel`.
-
-        Examples::
-
-            >>> from transformers import BartConfig, BartModel
-
-            >>> config = BartConfig.from_pretrained('facebook/bart-large')
-            >>> model = BartModel(config)
-
-        """
-        if "hidden_size" in common_kwargs:
-            raise ValueError("hidden size is called d_model")
        super().__init__(
            num_labels=num_labels,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
-            **common_kwargs,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
        )
+
        self.vocab_size = vocab_size
-        self.d_model = d_model  # encoder_embed_dim and decoder_embed_dim
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
-        self.encoder_layers = self.num_hidden_layers = encoder_layers
+        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
-        self.encoder_layerdrop = encoder_layerdrop
-        self.decoder_layerdrop = decoder_layerdrop
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
-        self.max_position_embeddings = max_position_embeddings
-        self.init_std = init_std  # Normal(0, this parameter)
-        self.activation_function = activation_function
-
-        # Params introduced for Mbart
-        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
-        self.normalize_embedding = normalize_embedding  # True for mbart, False otherwise
-        self.normalize_before = normalize_before  # combo of fairseq's encoder_ and decoder_normalize_before
-        self.add_final_layer_norm = add_final_layer_norm
-
-        # Params introduced for Marian
-        self.add_bias_logits = add_bias_logits
-        self.static_position_embeddings = static_position_embeddings
-
-        # 3 Types of Dropout
+        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
-        self.dropout = dropout
-
-        # Classifier stuff
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
        self.classifier_dropout = classifier_dropout
-
-        # pos embedding offset
-        self.extra_pos_embeddings = extra_pos_embeddings
-        # bart has a hack that offsets positional embeddings by 2, other models don't do this
-
-        self.force_bos_token_to_be_generated = force_bos_token_to_be_generated
-
-        self.do_blenderbot_90_layernorm = do_blenderbot_90_layernorm
-
        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers
+        self.gradient_checkpointing = gradient_checkpointing
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+        self.force_bos_token_to_be_generated = force_bos_token_to_be_generated  # only relevant for CNN
+
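+        # IMPORTANT: the hard-coded attributes below appear to be kept only for TF compatibility.
+        # TODO: delete all of the following lines as soon as the TF implementation is ready.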
+        self.extra_pos_embeddings = 2
+        self.normalize_before = False
+        self.add_final_layer_norm = False
+        self.do_blenderbot_90_layernorm = False
+        self.normalize_embedding = True
+        self.static_position_embeddings = False
+        self.add_bias_logits = False

    @property
    def num_attention_heads(self) -> int:
@@ -220,11 +187,3 @@ class BartConfig(PretrainedConfig):
    @property
    def hidden_size(self) -> int:
        return self.d_model
-
-    def is_valid_mbart(self) -> bool:
-        """Is the configuration aligned with the MBART paper."""
-        if self.normalize_before and self.add_final_layer_norm and self.scale_embedding:
-            return True
-        if self.normalize_before or self.add_final_layer_norm or self.scale_embedding:
-            logger.info("This configuration is a mixture of MBART and BART settings")
-        return False
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,17 +12,18 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""PyTorch BART model, ported from the fairseq repo."""
+""" PyTorch BART model. """
+
+
 import math
 import random
 import warnings
 from typing import Optional, Tuple

-import numpy as np
 import torch
 import torch.nn.functional as F
 from torch import nn
-from torch.nn import CrossEntropyLoss, LayerNorm
+from torch.nn import CrossEntropyLoss

 from ...activations import ACT2FN
 from ...file_utils import (
@@ -52,32 +53,24 @@ _TOKENIZER_FOR_DOC = "BartTokenizer"


 BART_PRETRAINED_MODEL_ARCHIVE_LIST = [
-    "facebook/bart-base",
    "facebook/bart-large",
-    "facebook/bart-large-mnli",
-    "facebook/bart-large-cnn",
-    "facebook/bart-large-xsum",
-    "facebook/mbart-large-en-ro",
+    # See all BART models at https://huggingface.co/models?filter=bart
 ]
-# This list is incomplete. See all BART models at https://huggingface.co/models?filter=bart


-def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int):
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
-    Shift input ids one token to the right, and wrap the last non pad token (usually <eos>).
+    Shift input ids one token to the right.
    """
-    prev_output_tokens = input_ids.clone()
+    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+    shifted_input_ids[:, 0] = decoder_start_token_id

    assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
    # replace possible -100 values in labels by `pad_token_id`
-    prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id)
+    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

-    index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
-    decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze()
-    prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone()
-    prev_output_tokens[:, 0] = decoder_start_tokens
-
-    return prev_output_tokens
+    return shifted_input_ids


 def _make_causal_mask(input_ids_shape: torch.Size, dtype: torch.dtype, past_key_values_length: int = 0):
@@ -111,18 +104,15 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]

 class BartLearnedPositionalEmbedding(nn.Embedding):
    """
-    This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
-    based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
-    the forward function.
+    This module learns positional embeddings up to a fixed maximum size.
    """

-    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, offset: int):
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int):
+        assert padding_idx is not None, "`padding_idx` should not be None, but of type int"
        # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2
        # and adjust num_embeddings appropriately. Other models dont have this hack
-        self.offset = offset
-        assert padding_idx is not None, "`padding_idx` should not be None, but of type int"
-        num_embeddings += offset
-        super().__init__(num_embeddings, embedding_dim, padding_idx=padding_idx)
+        self.offset = 2
+        super().__init__(num_embeddings + self.offset, embedding_dim, padding_idx=padding_idx)

    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
        """`input_ids_shape` is expected to be [bsz x seqlen]."""
@@ -133,40 +123,6 @@ class BartLearnedPositionalEmbedding(nn.Embedding):
        return super().forward(positions + self.offset)


-class BartSinusoidalPositionalEmbedding(nn.Embedding):
-    """This module produces sinusoidal positional embeddings of any length."""
-
-    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
-        super().__init__(num_positions, embedding_dim)
-        self.weight = self._init_weight(self.weight)
-
-    @staticmethod
-    def _init_weight(out: nn.Parameter):
-        """
-        Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
-        the 2nd half of the vector. [dim // 2:]
-        """
-        n_pos, dim = out.shape
-        position_enc = np.array(
-            [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
-        )
-        out.requires_grad = False  # set early to avoid an error in pytorch-1.8+
-        sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1
-        out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
-        out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
-        out.detach_()
-        return out
-
-    @torch.no_grad()
-    def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0):
-        """`input_ids_shape` is expected to be [bsz x seqlen]."""
-        bsz, seq_len = input_ids_shape[:2]
-        positions = torch.arange(
-            past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
-        )
-        return super().forward(positions)
-
-
 class BartAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

@@ -310,14 +266,13 @@ class BartEncoderLayer(nn.Module):
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
        )
-        self.normalize_before = config.normalize_before
-        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
-        self.final_layer_norm = LayerNorm(self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, output_attentions: bool = False):
        """
@@ -325,33 +280,36 @@ class BartEncoderLayer(nn.Module):
            hidden_states (:obj:`torch.FloatTensor`): input to the layer of shape `(seq_len, batch, embed_dim)`
            attention_mask (:obj:`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            output_attentions (:obj:`bool`): Whether the base model outputs attentions. This requires the attentions tensor to be reshaped in this function.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+                returned tensors for more detail.
        """
        residual = hidden_states
-        if self.normalize_before:
-            hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states, attn_weights, _ = self.self_attn(
            hidden_states=hidden_states, attention_mask=attention_mask, output_attentions=output_attentions
        )
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
-        if not self.normalize_before:
-            hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states = self.self_attn_layer_norm(hidden_states)

        residual = hidden_states
-        if self.normalize_before:
-            hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
-        if not self.normalize_before:
-            hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.final_layer_norm(hidden_states)
+
        if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
-        return hidden_states, attn_weights
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs


 class BartDecoderLayer(nn.Module):
@@ -368,19 +326,18 @@ class BartDecoderLayer(nn.Module):
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
-        self.normalize_before = config.normalize_before

-        self.self_attn_layer_norm = LayerNorm(self.embed_dim)
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.encoder_attn = BartAttention(
            self.embed_dim,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
-        self.encoder_attn_layer_norm = LayerNorm(self.embed_dim)
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
-        self.final_layer_norm = LayerNorm(self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
@@ -389,7 +346,8 @@ class BartDecoderLayer(nn.Module):
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[torch.Tensor] = False,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
    ):
        """
        Args:
@@ -400,11 +358,11 @@ class BartDecoderLayer(nn.Module):
            encoder_attention_mask (:obj:`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            past_key_value (:obj:`Tuple(torch.FloatTensor)`): cached past key and value projection states
-            output_attentions (:obj:`bool`): Whether the base model outputs attentions. This requires the attentions tensor to be reshaped in this function.
+            output_attentions (:obj:`bool`, `optional`):
+                Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under
+                returned tensors for more detail.
        """
        residual = hidden_states
-        if self.normalize_before:
-            hidden_states = self.self_attn_layer_norm(hidden_states)

        # Self Attention
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
@@ -418,16 +376,13 @@ class BartDecoderLayer(nn.Module):
        )
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
-        if not self.normalize_before:
-            hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block
        cross_attn_present_key_value = None
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states
-            if self.normalize_before:
-                hidden_states = self.encoder_attn_layer_norm(hidden_states)

            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
@@ -440,30 +395,29 @@ class BartDecoderLayer(nn.Module):
            )
            hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
            hidden_states = residual + hidden_states
-            if not self.normalize_before:
-                hidden_states = self.encoder_attn_layer_norm(hidden_states)
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)

            # add cross-attn to positions 3,4 of present_key_value tuple
            present_key_value = present_key_value + cross_attn_present_key_value

        # Fully Connected
        residual = hidden_states
-        if self.normalize_before:
-            hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states
-        if not self.normalize_before:
-            hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.final_layer_norm(hidden_states)

-        return (
-            hidden_states,
-            self_attn_weights,
-            present_key_value,
-            cross_attn_weights,
-        )
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs


 class BartClassificationHead(nn.Module):
@@ -500,8 +454,6 @@ class BartPretrainedModel(PreTrainedModel):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
-        elif isinstance(module, BartSinusoidalPositionalEmbedding):
-            pass
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
@@ -536,10 +488,10 @@ BART_START_DOCSTRING = r"""
    general usage and behavior.

    Parameters:
-        config (:class:`~transformers.BartConfig`): Model configuration class with all the parameters of the model.
-            Initializing with a config file does not load the weights associated with the model, only the
-            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
-            weights.
+        config (:class:`~transformers.BartConfig`):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """

 BART_GENERATION_EXAMPLE = r"""
@@ -547,9 +499,8 @@ BART_GENERATION_EXAMPLE = r"""

        >>> from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

-        >>> # see ``examples/summarization/bart/run_eval.py`` for a longer example
-        >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
-        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+        >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
+        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

        >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
        >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')
@@ -557,6 +508,22 @@ BART_GENERATION_EXAMPLE = r"""
        >>> # Generate Summary
        >>> summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
        >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])
+
+    Mask filling example::
+
+        >>> from transformers import BartTokenizer, BartForConditionalGeneration
+        >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
+        >>> TXT = "My friends are <mask> but they eat too many carbs."
+
+        >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
+        >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
+        >>> logits = model(input_ids).logits
+
+        >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
+        >>> probs = logits[0, masked_index].softmax(dim=0)
+        >>> values, predictions = probs.topk(5)
+
+        >>> tokenizer.decode(predictions).split()
 """

 BART_INPUTS_DOCSTRING = r"""
@@ -578,9 +545,22 @@ BART_INPUTS_DOCSTRING = r"""

            `What are attention masks? <../glossary.html#attention-mask>`__
        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
-            Provide for translation and summarization training. By default, the model will create this tensor by
-            shifting the :obj:`input_ids` to the right, following the paper.
-        decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using :class:`~transformers.BartTokenizer`. See
+            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+
+            Bart uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
+            :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
+            :obj:`past_key_values`).
+
+            For translation and summarization training, :obj:`decoder_input_ids` should be provided. If no
+            :obj:`decoder_input_ids` is provided, the model will create this tensor by shifting the :obj:`input_ids` to
+            the right for denoising pre-training following the paper.
+        decoder_attention_mask (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
            also be used by default.

@@ -641,30 +621,22 @@ class BartEncoder(BartPretrainedModel):
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
-        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_position_embeddings
+        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        if embed_tokens is not None:
            self.embed_tokens = embed_tokens
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)

-        if config.static_position_embeddings:
-            self.embed_positions = BartSinusoidalPositionalEmbedding(
-                config.max_position_embeddings, embed_dim, self.padding_idx
-            )
-        else:
-            self.embed_positions = BartLearnedPositionalEmbedding(
-                config.max_position_embeddings,
-                embed_dim,
-                self.padding_idx,
-                config.extra_pos_embeddings,
-            )
+        self.embed_positions = BartLearnedPositionalEmbedding(
+            config.max_position_embeddings,
+            embed_dim,
+            self.padding_idx,
+        )
        self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)])
-        self.layernorm_embedding = LayerNorm(embed_dim) if config.normalize_embedding else nn.Identity()
-        # mbart has one extra layer_norm
-        self.layer_norm = LayerNorm(config.d_model) if config.add_final_layer_norm else None
+        self.layernorm_embedding = nn.LayerNorm(embed_dim)

        self.init_weights()

@@ -747,15 +719,28 @@ class BartEncoder(BartPretrainedModel):
            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = random.uniform(0, 1)
            if self.training and (dropout_probability < self.layerdrop):  # skip the layer
-                attn = None
+                layer_outputs = (None, None)
            else:
-                hidden_states, attn = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions)
+                if getattr(self.config, "gradient_checkpointing", False):
+
+                    def create_custom_forward(module):
+                        def custom_forward(*inputs):
+                            return module(*inputs, output_attentions)
+
+                        return custom_forward
+
+                    layer_outputs = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(encoder_layer),
+                        hidden_states,
+                        attention_mask,
+                    )
+                else:
+                    layer_outputs = encoder_layer(hidden_states, attention_mask, output_attentions=output_attentions)
+
+                hidden_states = layer_outputs[0]

            if output_attentions:
-                all_attentions = all_attentions + (attn,)
-
-        if self.layer_norm:
-            hidden_states = self.layer_norm(hidden_states)
+                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)
@@ -780,7 +765,6 @@ class BartDecoder(BartPretrainedModel):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.decoder_layerdrop
-        self.do_blenderbot_90_layernorm = config.do_blenderbot_90_layernorm  # layernorm variant
        self.padding_idx = config.pad_token_id
        self.max_target_positions = config.max_position_embeddings
        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
@@ -790,20 +774,13 @@ class BartDecoder(BartPretrainedModel):
        else:
            self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)

-        if config.static_position_embeddings:
-            self.embed_positions = BartSinusoidalPositionalEmbedding(
-                config.max_position_embeddings, config.d_model, config.pad_token_id
-            )
-        else:
-            self.embed_positions = BartLearnedPositionalEmbedding(
-                config.max_position_embeddings,
-                config.d_model,
-                self.padding_idx,
-                config.extra_pos_embeddings,
-            )
+        self.embed_positions = BartLearnedPositionalEmbedding(
+            config.max_position_embeddings,
+            config.d_model,
+            self.padding_idx,
+        )
        self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)])
-        self.layernorm_embedding = LayerNorm(config.d_model) if config.normalize_embedding else nn.Identity()
-        self.layer_norm = LayerNorm(config.d_model) if config.add_final_layer_norm else None
+        self.layernorm_embedding = nn.LayerNorm(config.d_model)

        self.init_weights()

@@ -902,33 +879,6 @@ class BartDecoder(BartPretrainedModel):
                input_shape, inputs_embeds.dtype, past_key_values_length=past_key_values_length
            ).to(self.device)

-        # create decoder_padding_mask if not provided and needed
-        # 4.12.20 (PVP): Not a fan of this "magical" function that
-        # automatically creates attention_mask for padded tokens
-        # => this is inconsistent with other models
-        # => Pegasus uses the pad_token as decoder_start_token_id, so that this could
-        # pose some problems.
-        if (
-            attention_mask is None
-            and input_ids is not None
-            and input_shape[-1] > 1
-            and self.config.pad_token_id in input_ids
-        ):
-            # should be kept for backwards compatibility
-            attention_mask = input_ids.ne(self.config.pad_token_id).to(torch.long)
-            # never mask leading token, even if it is pad
-            attention_mask[:, 0] = attention_mask[:, 1]
-            if past_key_values_length > 0:
-                attention_mask = torch.cat(
-                    [
-                        torch.ones(
-                            (input_shape[0], past_key_values_length), dtype=torch.long, device=input_ids.device
-                        ),
-                        attention_mask,
-                    ],
-                    dim=-1,
-                )
-
        if attention_mask is not None and combined_attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            combined_attention_mask = combined_attention_mask + _expand_mask(
@@ -943,12 +893,8 @@ class BartDecoder(BartPretrainedModel):
        # embed positions
        positions = self.embed_positions(input_shape, past_key_values_length)

-        if self.do_blenderbot_90_layernorm:
-            hidden_states = self.layernorm_embedding(inputs_embeds)
-            hidden_states += positions
-        else:
-            hidden_states = inputs_embeds + positions
-            hidden_states = self.layernorm_embedding(hidden_states)
+        hidden_states = inputs_embeds + positions
+        hidden_states = self.layernorm_embedding(hidden_states)

        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)

@@ -967,30 +913,51 @@ class BartDecoder(BartPretrainedModel):

            past_key_value = past_key_values[idx] if past_key_values is not None else None

-            hidden_states, layer_self_attn, present_key_value, layer_cross_attn = decoder_layer(
-                hidden_states,
-                attention_mask=combined_attention_mask,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_attention_mask,
-                past_key_value=past_key_value,
-                output_attentions=output_attentions,
-            )
+            if getattr(self.config, "gradient_checkpointing", False):
+                if use_cache:
+                    raise ValueError(
+                        "When using `gradient_checkpointing, make sure that `use_cache=False` and `config.use_cache=False`."
+                    )
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        # None for past_key_value
+                        return module(*inputs, output_attentions, use_cache)
+
+                    return custom_forward
+
+                layer_outputs = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(decoder_layer),
+                    hidden_states,
+                    combined_attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    None,
+                )
+            else:
+
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=combined_attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]

            if use_cache:
-                next_decoder_cache += (present_key_value,)
+                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)

            if output_attentions:
-                all_self_attns += (layer_self_attn,)
-                all_cross_attentions += (layer_cross_attn,)
+                all_self_attns += (layer_outputs[1],)
+                all_cross_attentions += (layer_outputs[2],)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

-        # if config.add_final_layer_norm (mBART)
-        if self.layer_norm:
-            hidden_states = self.layer_norm(hidden_states)
-
        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
@@ -1060,12 +1027,12 @@ class BartModel(BartPretrainedModel):
        return_dict=None,
    ):

-        # 4.12.20 (PVP): Not a fan of this "magical" function and
-        # also wonder how often it's actually used ... keep now
-        # for backward compatibility
-        # -> is this used for backward compatibility
+        # different to other models, Bart automatically creates decoder_input_ids from
+        # input_ids if no decoder_input_ids are provided
        if decoder_input_ids is None and decoder_inputs_embeds is None:
-            decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id)
+            decoder_input_ids = shift_tokens_right(
+                input_ids, self.config.pad_token_id, self.config.decoder_start_token_id
+            )

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
@@ -1083,7 +1050,7 @@ class BartModel(BartPretrainedModel):
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
-        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=False
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
@@ -1192,31 +1159,14 @@ class BartForConditionalGeneration(BartPretrainedModel):
            (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``.

        Returns:
-
-        Conditional generation example::
-
-            >>> # Mask filling only works for bart-large
-            >>> from transformers import BartTokenizer, BartForConditionalGeneration
-            >>> tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
-            >>> TXT = "My friends are <mask> but they eat too many carbs."
-
-            >>> model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
-            >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
-            >>> logits = model(input_ids).logits
-
-            >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
-            >>> probs = logits[0, masked_index].softmax(dim=0)
-            >>> values, predictions = probs.topk(5)
-
-            >>> tokenizer.decode(predictions).split()
-            >>> # ['good', 'great', 'all', 'really', 'very']
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
-            use_cache = False
            if decoder_input_ids is None:
-                decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )

        outputs = self.model(
            input_ids,
@@ -1237,7 +1187,6 @@ class BartForConditionalGeneration(BartPretrainedModel):
        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
-            # TODO(SS): do we need to ignore pad tokens in labels?
            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
@@ -1288,7 +1237,10 @@ class BartForConditionalGeneration(BartPretrainedModel):
    def _reorder_cache(past, beam_idx):
        reordered_past = ()
        for layer_past in past:
-            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
+            # cached cross_attention states don't have to be reordered -> they are always the same
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
+            )
        return reordered_past


--- a/src/transformers/models/bart/modeling_tf_bart.py
+++ b/src/transformers/models/bart/modeling_tf_bart.py
@@ -545,7 +545,7 @@ BART_INPUTS_DOCSTRING = r"""
        decoder_input_ids (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Provide for translation and summarization training. By default, the model will create this tensor by
            shifting the input_ids right, following the paper.
-        decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
+        decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
        encoder_outputs (:obj:`tf.FloatTensor`, `optional`):
            hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
--- a/src/transformers/models/blenderbot/init.py
+++ b/src/transformers/models/blenderbot/init.py
@@ -18,7 +18,7 @@

 from ...file_utils import is_tf_available, is_torch_available
 from .configuration_blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig
-from .tokenization_blenderbot import BlenderbotSmallTokenizer, BlenderbotTokenizer
+from .tokenization_blenderbot import BlenderbotTokenizer


 if is_torch_available():
@@ -26,7 +26,9 @@ if is_torch_available():
        BLENDERBOT_PRETRAINED_MODEL_ARCHIVE_LIST,
        BlenderbotForConditionalGeneration,
        BlenderbotModel,
+        BlenderbotPreTrainedModel,
    )

+
 if is_tf_available():
    from .modeling_tf_blenderbot import TFBlenderbotForConditionalGeneration
--- a/src/transformers/models/blenderbot/configuration_blenderbot.py
+++ b/src/transformers/models/blenderbot/configuration_blenderbot.py
@@ -1,8 +1,7 @@
-#!/usr/bin/env python3
 # coding=utf-8
-# Copyright (c) Facebook, Inc. and Huggingface, 2020
+# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved.
 #
-# This source code is licensed under the MIT license found in the;
+# Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
@@ -13,46 +12,49 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# LICENSE file in the root directory of this source tree.
-"""
-BlenderbotConfig has the same signature as BartConfig. We only rewrite the signature in order to document
-blenderbot-90M defaults.
-"""
-from ..bart.configuration_bart import BartConfig
+""" Blenderbot model configuration """

+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)

 BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "facebook/blenderbot-3B": "https://cdn.huggingface.co/facebook/blenderbot-3B/config.json",
-    "facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/config.json",
+    "facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/config.json",
+    # See all Blenderbot models at https://huggingface.co/models?filter=blenderbot
 }


-class BlenderbotConfig(BartConfig):
+class BlenderbotConfig(PretrainedConfig):
    r"""
-    This is the configuration class to store the configuration of a
-    :class:`~transformers.BlenderbotForConditionalGeneration`. It inherits from :class:`~transformers.BartConfig` and
-    has the same signature with different defaults.
+    This is the configuration class to store the configuration of a :class:`~transformers.BlenderbotModel`. It is used
+    to instantiate a Blenderbot model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Blenderbot
+    `facebook/blenderbot-3B <https://huggingface.co/facebook/blenderbot-3B>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

+
    Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 54944):
-            Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.BlenderbotForConditionalGeneration`.
-        d_model (:obj:`int`, `optional`, defaults to 512):
+        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+            Vocabulary size of the Blenderbot model. Defines the number of different tokens that can be represented by
+            the :obj:`input_ids` passed when calling :class:`~transformers.BlenderbotModel` or
+            :class:`~transformers.TFBlenderbotModel`.
+        d_model (:obj:`int`, `optional`, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 8):
-            Number of encoder layers, 6 are used for the `blenderbot-90M` model.
-        decoder_layers (:obj:`int`, `optional`, defaults to 8):
-            Number of decoder layers, 6 are used for the `blenderbot-90M` model.
+        encoder_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of encoder layers.
+        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of decoder layers.
        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
+        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
+        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
@@ -65,117 +67,115 @@ class BlenderbotConfig(BartConfig):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for classifier.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 128):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            This should be completed, specific to marian.
-        normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Call layernorm before attention ops.
-        normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Call layernorm after embeddings.
-        static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Don't learn positional embeddings, use sinusoidal.
-        add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Why not add another layernorm?
-        do_blenderbot_90_layernorm (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Blenderbot-90m checkpoint uses `layernorm_embedding` one line earlier in the decoder.
-        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Scale embeddings by diving by sqrt(d_model).
-        eos_token_id (:obj:`int`, `optional`, defaults to 2)
-            End of stream token id.
-        pad_token_id (:obj:`int`, `optional`, defaults to 1)
-            Padding token id.
-        bos_token_id (:obj:`int`, `optional`, defaults to 0)
-            Beginning of stream token id.
        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
            https://arxiv.org/abs/1909.11556>`__ for more details.
        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
            https://arxiv.org/abs/1909.11556>`__ for more details.
-        extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
-            How many extra learned positional embeddings to use. Should be set to :obj:`pad_token_id+1`.
-        is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether this is an encoder/decoder model.
-        force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``),
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to scale embeddings; the scale factor is sqrt(d_model) if True.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models)
+
+    Example::
+
+        >>> from transformers import BlenderbotModel, BlenderbotConfig
+
+        >>> # Initializing a Blenderbot facebook/blenderbot-3B style configuration
+        >>> configuration = BlenderbotConfig()
+
+        >>> # Initializing a model from the facebook/blenderbot-3B style configuration
+        >>> model = BlenderbotModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
    """
    model_type = "blenderbot"
+    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
-        activation_dropout=0.0,
-        extra_pos_embeddings=0,
-        activation_function="gelu",
-        vocab_size=54944,
-        d_model=512,
-        encoder_ffn_dim=2048,
-        encoder_layers=8,
-        encoder_attention_heads=16,
-        decoder_ffn_dim=2048,
-        decoder_layers=8,
-        decoder_attention_heads=16,
+        vocab_size=8008,
+        max_position_embeddings=128,
+        encoder_layers=2,
+        encoder_ffn_dim=10240,
+        encoder_attention_heads=32,
+        decoder_layers=24,
+        decoder_ffn_dim=10240,
+        decoder_attention_heads=32,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
-        attention_dropout=0.0,
-        dropout=0.1,
-        max_position_embeddings=512,
-        classifier_dropout=0.0,
+        use_cache=True,
        is_encoder_decoder=True,
-        pad_token_id=1,
-        bos_token_id=0,
-        eos_token_id=2,
-        normalize_before=False,
-        add_final_layer_norm=False,
-        do_blenderbot_90_layernorm=True,
+        activation_function="gelu",
+        d_model=2560,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        decoder_start_token_id=1,
+        classifier_dropout=0.0,
        scale_embedding=False,
-        normalize_embedding=True,
-        static_position_embeddings=False,
-        add_bias_logits=False,
-        force_bos_token_to_be_generated=False,
-        **common_kwargs
+        gradient_checkpointing=False,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        **kwargs
    ):
-        r"""
-        Examples::
-
-            >>> from transformers import BlenderbotConfig
-            >>> config = BlenderbotConfig.from_pretrained('facebook/blenderbot-90M')
-
-        """
-        if "hidden_size" in common_kwargs:
-            raise ValueError("hidden size is called d_model")
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
-            vocab_size=vocab_size,
-            d_model=d_model,
-            encoder_ffn_dim=encoder_ffn_dim,
-            encoder_layers=encoder_layers,
-            encoder_layerdrop=encoder_layerdrop,
-            encoder_attention_heads=encoder_attention_heads,
-            decoder_layerdrop=decoder_layerdrop,
-            decoder_ffn_dim=decoder_ffn_dim,
-            decoder_layers=decoder_layers,
-            normalize_before=normalize_before,
-            normalize_embedding=normalize_embedding,
-            static_position_embeddings=static_position_embeddings,
-            add_bias_logits=add_bias_logits,
-            force_bos_token_to_be_generated=force_bos_token_to_be_generated,
-            do_blenderbot_90_layernorm=do_blenderbot_90_layernorm,
-            add_final_layer_norm=add_final_layer_norm,
-            scale_embedding=scale_embedding,
-            attention_dropout=attention_dropout,
-            dropout=dropout,
-            classifier_dropout=classifier_dropout,
-            activation_dropout=activation_dropout,
-            max_position_embeddings=max_position_embeddings,
-            extra_pos_embeddings=extra_pos_embeddings,
-            activation_function=activation_function,
-            decoder_attention_heads=decoder_attention_heads,
-            **common_kwargs,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
        )
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.classifier_dropout = classifier_dropout
+        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers
+        self.gradient_checkpointing = gradient_checkpointing
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
+        # IMPORTANT
+        # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY
+        self.extra_pos_embeddings = 0
+        self.normalize_before = True
+        self.add_final_layer_norm = True
+        self.do_blenderbot_90_layernorm = True
+        self.normalize_embedding = False
+        self.static_position_embeddings = False
+        self.add_bias_logits = False
+        self.force_bos_token_to_be_generated = False
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self.encoder_attention_heads
+
+    @property
+    def hidden_size(self) -> int:
+        return self.d_model
--- a/src/transformers/models/blenderbot/modeling_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_blenderbot.py
--- a/src/transformers/models/blenderbot/tokenization_blenderbot.py
+++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py
@@ -1,8 +1,7 @@
-#!/usr/bin/env python3
 # coding=utf-8
-# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
 #
-# This source code is licensed under the MIT license found in the;
+# Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
@@ -13,15 +12,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# LICENSE file in the root directory of this source tree.
-""""BlenderbotTokenizer and BlenderbotSmallTokenizer"""
-import json
-import os
-from typing import Dict, List, Optional, Tuple
+"""Tokenization class for Blenderbot."""

-import regex as re
+from typing import List

-from ...tokenization_utils import PreTrainedTokenizer
 from ...utils import logging
 from ..roberta.tokenization_roberta import RobertaTokenizer

@@ -93,177 +87,3 @@ def get_pairs(word):

    pairs = set(pairs)
    return pairs
-
-
-class BlenderbotSmallTokenizer(PreTrainedTokenizer):
-    """
-    Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair-Encoding)
-
-    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
-    Users should refer to the superclass for more information regarding methods.
-
-    Args:
-        vocab_file (:obj:`str`):
-            File containing the vocabulary.
-        merges_file (:obj:`str`):
-            Path to the merges file.
-        bos_token (:obj:`str`, `optional`, defaults to :obj:`"__start__"`):
-            The beginning of sentence token.
-        eos_token (:obj:`str`, `optional`, defaults to :obj:`"__end__"`):
-            The end of sentence token.
-        unk_token (:obj:`str`, `optional`, defaults to :obj:`"__unk__"`):
-            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-            token instead.
-        pad_token (:obj:`str`, `optional`, defaults to :obj:`"__pad__"`):
-            The token used for padding, for example when batching sequences of different lengths.
-        **kwargs
-            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
-    """
-
-    vocab_files_names = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
-    pretrained_vocab_files_map = {
-        "vocab_file": {"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/vocab.json"},
-        "merges_file": {"facebook/blenderbot-90M": "https://cdn.huggingface.co/facebook/blenderbot-90M/merges.txt"},
-    }
-    max_model_input_sizes = {"facebook/blenderbot-90M": 512}
-
-    def __init__(
-        self,
-        vocab_file,
-        merges_file,
-        bos_token="__start__",
-        eos_token="__end__",
-        unk_token="__unk__",
-        pad_token="__null__",
-        **kwargs
-    ):
-        super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs)
-
-        with open(vocab_file, encoding="utf-8") as vocab_handle:
-            self.encoder = json.load(vocab_handle)
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        with open(merges_file, encoding="utf-8") as merges_handle:
-            merges = merges_handle.read().split("\n")[1:-1]
-        merges = [tuple(merge.split()) for merge in merges]
-        self.bpe_ranks = dict(zip(merges, range(len(merges))))
-        self.cache = {}
-
-    @property
-    def vocab_size(self) -> int:
-        return len(self.encoder)
-
-    def get_vocab(self) -> Dict:
-        return dict(self.encoder, **self.added_tokens_encoder)
-
-    def bpe(self, token: str) -> str:
-        if token in self.cache:
-            return self.cache[token]
-        token = re.sub("([.,!?()])", r" \1", token)
-        token = re.sub("(')", r" \1 ", token)
-        token = re.sub(r"\s{2,}", " ", token)
-        if "\n" in token:
-            token = token.replace("\n", " __newln__")
-
-        tokens = token.split(" ")
-        words = []
-        for token in tokens:
-            if not len(token):
-                continue
-
-            token = token.lower()
-            word = tuple(token)
-            word = tuple(list(word[:-1]) + [word[-1] + "</w>"])
-            pairs = get_pairs(word)
-
-            if not pairs:
-                words.append(token)
-                continue
-
-            while True:
-                bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-                if bigram not in self.bpe_ranks:
-                    break
-                first, second = bigram
-                new_word = []
-                i = 0
-
-                while i < len(word):
-                    try:
-                        j = word.index(first, i)
-                        new_word.extend(word[i:j])
-                        i = j
-                    except ValueError:
-                        new_word.extend(word[i:])
-                        break
-
-                    if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                        new_word.append(first + second)
-                        i += 2
-                    else:
-                        new_word.append(word[i])
-                        i += 1
-                new_word = tuple(new_word)
-                word = new_word
-                if len(word) == 1:
-                    break
-                else:
-                    pairs = get_pairs(word)
-            word = "@@ ".join(word)
-            word = word[:-4]
-
-            self.cache[token] = word
-            words.append(word)
-        return " ".join(words)
-
-    def _tokenize(self, text: str) -> List[str]:
-        """ Split a string into tokens using BPE."""
-        split_tokens = []
-
-        words = re.findall(r"\S+\n?", text)
-
-        for token in words:
-            split_tokens.extend([t for t in self.bpe(token).split(" ")])
-        return split_tokens
-
-    def _convert_token_to_id(self, token: str) -> int:
-        """ Converts a token to an id using the vocab. """
-        token = token.lower()
-        return self.encoder.get(token, self.encoder.get(self.unk_token))
-
-    def _convert_id_to_token(self, index: int) -> str:
-        """Converts an index (integer) in a token (str) using the vocab."""
-        return self.decoder.get(index, self.unk_token)
-
-    def convert_tokens_to_string(self, tokens: List[str]) -> str:
-        """ Converts a sequence of tokens  in a single string. """
-        out_string = " ".join(tokens).replace("@@ ", "").strip()
-        return out_string
-
-    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
-        if not os.path.isdir(save_directory):
-            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
-            return
-        vocab_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
-        )
-        merge_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
-        )
-
-        with open(vocab_file, "w", encoding="utf-8") as f:
-            f.write(json.dumps(self.encoder, ensure_ascii=False))
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write("#version: 0.2\n")
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning(
-                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
-                        " Please check that the tokenizer is not corrupted!".format(merge_file)
-                    )
-                    index = token_index
-                writer.write(" ".join(bpe_tokens) + "\n")
-                index += 1
-
-        return vocab_file, merge_file
--- a/src/transformers/models/blenderbot_small/init.py
+++ b/src/transformers/models/blenderbot_small/init.py
@@ -0,0 +1,29 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...file_utils import is_torch_available
+from .configuration_blenderbot_small import BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotSmallConfig
+from .tokenization_blenderbot_small import BlenderbotSmallTokenizer
+
+
+if is_torch_available():
+    from .modeling_blenderbot_small import (
+        BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST,
+        BlenderbotSmallForConditionalGeneration,
+        BlenderbotSmallModel,
+        BlenderbotSmallPreTrainedModel,
+    )
--- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py
@@ -0,0 +1,170 @@
+# coding=utf-8
+# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" BlenderbotSmall model configuration """
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/config.json",
+    # See all BlenderbotSmall models at https://huggingface.co/models?filter=blenderbot_small
+}
+
+
+class BlenderbotSmallConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.BlenderbotSmallModel`. It is
+    used to instantiate a BlenderbotSmall model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the BlenderbotSmall
+    `facebook/blenderbot_small-90M <https://huggingface.co/facebook/blenderbot_small-90M>`__ architecture.
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+            Vocabulary size of the BlenderbotSmall model. Defines the number of different tokens that can be
+            represented by the :obj:`input_ids` passed when calling :class:`~transformers.BlenderbotSmallModel` or
+            :class:`~transformers.TFBlenderbotSmallModel`.
+        d_model (:obj:`int`, `optional`, defaults to 512):
+            Dimensionality of the layers and the pooler layer.
+        encoder_layers (:obj:`int`, `optional`, defaults to 8):
+            Number of encoder layers.
+        decoder_layers (:obj:`int`, `optional`, defaults to 8):
+            Number of decoder layers.
+        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
+        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
+        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for classifier.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        init_std (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        encoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the `LayerDrop paper
+            <https://arxiv.org/abs/1909.11556>`__ for more details.
+        decoder_layerdrop (:obj:`float`, `optional`, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the `LayerDrop paper
+            <https://arxiv.org/abs/1909.11556>`__ for more details.
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to scale embeddings; the scale factor is sqrt(d_model) if True.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models)
+
+    Example::
+
+        >>> from transformers import BlenderbotSmallModel, BlenderbotSmallConfig
+
+        >>> # Initializing a BlenderbotSmall facebook/blenderbot_small-90M style configuration
+        >>> configuration = BlenderbotSmallConfig()
+
+        >>> # Initializing a model from the facebook/blenderbot_small-90M style configuration
+        >>> model = BlenderbotSmallModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
+    """
+    model_type = "blenderbot-small"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=50265,
+        max_position_embeddings=512,
+        encoder_layers=8,
+        encoder_ffn_dim=2048,
+        encoder_attention_heads=16,
+        decoder_layers=8,
+        decoder_ffn_dim=2048,
+        decoder_attention_heads=16,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        use_cache=True,
+        is_encoder_decoder=True,
+        activation_function="gelu",
+        d_model=512,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        decoder_start_token_id=1,
+        classifier_dropout=0.0,
+        scale_embedding=False,
+        gradient_checkpointing=False,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        **kwargs
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
+        )
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.classifier_dropout = classifier_dropout
+        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers
+        self.gradient_checkpointing = gradient_checkpointing
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self.encoder_attention_heads
+
+    @property
+    def hidden_size(self) -> int:
+        return self.d_model
--- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
--- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
@@ -0,0 +1,228 @@
+# coding=utf-8
+# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for BlenderbotSmall."""
+
+import json
+import os
+from typing import Dict, List, Optional, Tuple
+
+import regex as re
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import logging
+
+
+# Module-level logger, used by `save_vocabulary` to report problems.
+logger = logging.get_logger(__name__)
+
+
+# File names appended to the target directory by `BlenderbotSmallTokenizer.save_vocabulary`.
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+    # "tokenizer_config_file": "tokenizer_config.json",
+}
+
+
+def get_pairs(word):
+    """
+    Return the set of adjacent symbol pairs in a word.
+
+    Args:
+        word: Sequence of symbols, e.g. a tuple of variable-length strings (a plain string also works).
+
+    Returns:
+        :obj:`set`: Every ``(left, right)`` pair of neighbouring symbols. The set is empty when ``word`` holds fewer
+        than two symbols.
+    """
+    # zip stops at the shorter operand, so empty and single-symbol words yield no pairs instead of raising.
+    return set(zip(word, word[1:]))
+
+
+class BlenderbotSmallTokenizer(PreTrainedTokenizer):
+    """
+    Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair-Encoding)
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    Users should refer to the superclass for more information regarding methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            File containing the vocabulary.
+        merges_file (:obj:`str`):
+            Path to the merges file.
+        bos_token (:obj:`str`, `optional`, defaults to :obj:`"__start__"`):
+            The beginning of sentence token.
+        eos_token (:obj:`str`, `optional`, defaults to :obj:`"__end__"`):
+            The end of sentence token.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"__unk__"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"__null__"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        **kwargs
+            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = {
+        "vocab_file": {
+            "facebook/blenderbot_small-90M": "https://cdn.huggingface.co/facebook/blenderbot_small-90M/vocab.json"
+        },
+        "merges_file": {
+            "facebook/blenderbot_small-90M": "https://cdn.huggingface.co/facebook/blenderbot_small-90M/merges.txt"
+        },
+    }
+    max_model_input_sizes = {"facebook/blenderbot_small-90M": 512}
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        bos_token="__start__",
+        eos_token="__end__",
+        unk_token="__unk__",
+        pad_token="__null__",
+        **kwargs
+    ):
+        super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs)
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {index: token for token, index in self.encoder.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            # Skip the first line (`save_vocabulary` writes a "#version" header there) and the last element, which
+            # is the empty string left by the trailing newline.
+            merge_lines = merges_handle.read().split("\n")[1:-1]
+        # A merge's rank is its position in the file; lower ranks are applied first in `bpe`.
+        self.bpe_ranks = {tuple(line.split()): rank for rank, line in enumerate(merge_lines)}
+        self.cache = {}
+
+    @property
+    def vocab_size(self) -> int:
+        """Number of entries in the base vocabulary (added tokens are not counted)."""
+        return len(self.encoder)
+
+    def get_vocab(self) -> Dict:
+        """Return the base vocabulary merged with the added tokens as one token-to-id mapping."""
+        return dict(self.encoder, **self.added_tokens_encoder)
+
+    def bpe(self, token: str) -> str:
+        """
+        Apply the BPE merges to a whitespace-free chunk of text.
+
+        Args:
+            token (:obj:`str`): The chunk to encode. It may carry punctuation or a trailing newline.
+
+        Returns:
+            :obj:`str`: The space-separated sub-word units, where every non-final unit of a word ends with ``"@@"``.
+        """
+        if token in self.cache:
+            return self.cache[token]
+        # Detach punctuation and apostrophes so that they are encoded as separate pieces.
+        token = re.sub("([.,!?()])", r" \1", token)
+        token = re.sub("(')", r" \1 ", token)
+        token = re.sub(r"\s{2,}", " ", token)
+        if "\n" in token:
+            token = token.replace("\n", " __newln__")
+
+        words = []
+        for piece in token.split(" "):
+            if not piece:
+                continue
+
+            piece = piece.lower()
+            # One symbol per character, with the end-of-word marker glued to the last one.
+            word = tuple(piece[:-1]) + (piece[-1] + "</w>",)
+            pairs = get_pairs(word)
+
+            if not pairs:
+                # Single-symbol pieces cannot be merged and are emitted unchanged.
+                words.append(piece)
+                continue
+
+            while True:
+                # Pick the known merge with the lowest rank; stop once no remaining pair is a known merge.
+                bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+                if bigram not in self.bpe_ranks:
+                    break
+                first, second = bigram
+                merged = []
+                i = 0
+
+                while i < len(word):
+                    try:
+                        j = word.index(first, i)
+                    except ValueError:
+                        # No further occurrence of `first`: copy the remainder untouched.
+                        merged.extend(word[i:])
+                        break
+                    merged.extend(word[i:j])
+                    i = j
+
+                    # word[i] is `first` here; merge only when it is directly followed by `second`.
+                    if i < len(word) - 1 and word[i + 1] == second:
+                        merged.append(first + second)
+                        i += 2
+                    else:
+                        merged.append(word[i])
+                        i += 1
+                word = tuple(merged)
+                if len(word) == 1:
+                    break
+                pairs = get_pairs(word)
+            # Join the units with the "@@ " continuation marker and drop the trailing "</w>" (4 characters).
+            encoded = "@@ ".join(word)[:-4]
+
+            self.cache[piece] = encoded
+            words.append(encoded)
+        return " ".join(words)
+
+    def _tokenize(self, text: str) -> List[str]:
+        """ Split a string into tokens using BPE."""
+        split_tokens = []
+
+        # Each chunk is a run of non-whitespace characters, optionally followed by one newline.
+        for chunk in re.findall(r"\S+\n?", text):
+            split_tokens.extend(self.bpe(chunk).split(" "))
+        return split_tokens
+
+    def _convert_token_to_id(self, token: str) -> int:
+        """ Converts a token to an id using the vocab. """
+        token = token.lower()
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index: int) -> str:
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        """ Converts a sequence of tokens  in a single string. """
+        # Removing the "@@ " continuation markers glues the sub-word units back together.
+        return " ".join(tokens).replace("@@ ", "").strip()
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        """
+        Write the vocabulary and the merges to ``save_directory``.
+
+        Args:
+            save_directory (:obj:`str`): An existing directory to write into.
+            filename_prefix (:obj:`str`, `optional`): When given, prepended to both file names followed by ``"-"``.
+
+        Returns:
+            :obj:`Tuple[str]`: Paths of the vocabulary file and the merges file. Nothing is returned (``None``) when
+            ``save_directory`` is not a directory; an error is logged in that case.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        prefix = (filename_prefix + "-") if filename_prefix else ""
+        vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
+        merge_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["merges_file"])
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, ensure_ascii=False))
+
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            # Header line; `__init__` skips it when reading the file back.
+            writer.write("#version: 0.2\n")
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        "Saving vocabulary to {}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!".format(merge_file)
+                    )
+                    index = token_index
+                writer.write(" ".join(bpe_tokens) + "\n")
+                index += 1
+
+        return vocab_file, merge_file
--- a/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py
+++ b/src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py
@@ -0,0 +1,103 @@
+# coding=utf-8
+# Copyright 2021, The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast tokenization class for BlenderbotSmall."""
+from typing import List, Optional
+
+from tokenizers import ByteLevelBPETokenizer
+
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging
+from .tokenization_blenderbot_small import BlenderbotSmallTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+# NOTE(review): both mappings below are empty, whereas the slow tokenizer registers "vocab.json" / "merges.txt" and
+# their download URLs -- presumably placeholders; confirm that the fast tokenizer can be loaded by checkpoint name.
+VOCAB_FILES_NAMES = {}
+
+PRETRAINED_VOCAB_FILES_MAP = {}
+
+# Value exposed as `max_model_input_sizes` on the fast tokenizer class, keyed by checkpoint name.
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "facebook/blenderbot_small-90M": 512,
+}
+
+
+class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" BlenderbotSmall tokenizer (backed by HuggingFace's `tokenizers` library).
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to the vocabulary file.
+        merges_file (:obj:`str`):
+            Path to the merges file.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<|endoftext|>"`):
+            The unknown token.
+        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<|endoftext|>"`):
+            The beginning of sentence token.
+        eos_token (:obj:`str`, `optional`, defaults to :obj:`"<|endoftext|>"`):
+            The end of sentence token.
+        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Forwarded to the underlying :obj:`ByteLevelBPETokenizer` and stored on the instance.
+        trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Forwarded to the underlying :obj:`ByteLevelBPETokenizer`.
+        **kwargs
+            Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizerFast`
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    slow_tokenizer_class = BlenderbotSmallTokenizer
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        unk_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        eos_token="<|endoftext|>",
+        add_prefix_space=False,
+        trim_offsets=True,
+        **kwargs
+    ):
+        # Build the backend tokenizer first, then hand it to the generic fast-tokenizer wrapper.
+        backend_tokenizer = ByteLevelBPETokenizer(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            add_prefix_space=add_prefix_space,
+            trim_offsets=trim_offsets,
+        )
+        super().__init__(
+            backend_tokenizer,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            **kwargs,
+        )
+        self.add_prefix_space = add_prefix_space
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Wrap one or two sequences of ids with the special tokens.
+
+        A single sequence becomes ``bos A eos``; a pair becomes ``bos A eos eos B eos``.
+        """
+        bos = [self.bos_token_id]
+        eos = [self.eos_token_id]
+        single = bos + token_ids_0 + eos
+        if token_ids_1 is not None:
+            return single + eos + token_ids_1 + eos
+        return single
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. BlenderbotSmall
+        does not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        # Only the length of the assembled sequence matters; every position gets type id 0.
+        layout = cls + token_ids_0 + sep
+        if token_ids_1 is not None:
+            layout = layout + sep + token_ids_1 + sep
+        return [0] * len(layout)
--- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py
@@ -77,10 +77,21 @@ ENCODER_DECODER_INPUTS_DOCSTRING = r"""

            `What are attention masks? <../glossary.html#attention-mask>`__
        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
+            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+
+            If :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
+            :obj:`past_key_values`).
+
            Provide for sequence to sequence training to the decoder. Indices can be obtained using
            :class:`~transformers.PretrainedTokenizer`. See :meth:`transformers.PreTrainedTokenizer.encode` and
            :meth:`transformers.PreTrainedTokenizer.__call__` for details.
-        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
+        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
            also be used by default.
        encoder_outputs (:obj:`tuple(torch.FloatTensor)`, `optional`):
--- a/src/transformers/models/fsmt/modeling_fsmt.py
+++ b/src/transformers/models/fsmt/modeling_fsmt.py
@@ -235,7 +235,7 @@ FSMT_INPUTS_DOCSTRING = r"""
        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Provide for translation and summarization training. By default, the model will create this tensor by
            shifting the input_ids right, following the paper.
-        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
+        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
            also be used by default. If you want to change padding behavior, you should read
            :func:`modeling_fstm._prepare_fstm_decoder_inputs` and modify. See diagram 1 in the paper for more info on
--- a/src/transformers/models/marian/init.py
+++ b/src/transformers/models/marian/init.py
@@ -15,16 +15,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from ...file_utils import is_sentencepiece_available, is_tf_available, is_torch_available
-from .configuration_marian import MarianConfig
+from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available
+from .configuration_marian import MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP, MarianConfig


 if is_sentencepiece_available():
    from .tokenization_marian import MarianTokenizer

 if is_torch_available():
-    from .modeling_marian import MarianMTModel
+    from .modeling_marian import (
+        MARIAN_PRETRAINED_MODEL_ARCHIVE_LIST,
+        MarianModel,
+        MarianMTModel,
+        MarianPreTrainedModel,
+    )

 if is_tf_available():
    from .modeling_tf_marian import TFMarianMTModel
--- a/src/transformers/models/marian/configuration_marian.py
+++ b/src/transformers/models/marian/configuration_marian.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2020 The OPUS-NMT Team, Marian team, and The HuggingFace Inc. team.
+# Copyright 2021 The Marian Team Authors and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,40 +14,48 @@
 # limitations under the License.
 """ Marian model configuration """

-from ..bart.configuration_bart import BartConfig
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging


-PRETRAINED_CONFIG_ARCHIVE_MAP = {
+logger = logging.get_logger(__name__)
+
+MARIAN_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "Helsinki-NLP/opus-mt-en-de": "https://huggingface.co/Helsinki-NLP/opus-mt-en-de/resolve/main/config.json",
+    # See all Marian models at https://huggingface.co/models?filter=marian
 }


-class MarianConfig(BartConfig):
-    """
-    This is the configuration class to store the configuration of a :class:`~transformers.MarianMTModel`. It is used to
-    instantiate a Marian model according to the specified arguments, defining the model architecture.
+class MarianConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.MarianModel`. It is used to
+    instantiate a Marian model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the Marian
+    `Helsinki-NLP/opus-mt-en-de <https://huggingface.co/Helsinki-NLP/opus-mt-en-de>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

+
    Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 58101):
+        vocab_size (:obj:`int`, `optional`, defaults to 50265):
            Vocabulary size of the Marian model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.MarianMTModel`.
-        d_model (:obj:`int`, `optional`, defaults to 512):
+            :obj:`inputs_ids` passed when calling :class:`~transformers.MarianModel` or
+            :class:`~transformers.TFMarianModel`.
+        d_model (:obj:`int`, `optional`, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 6):
+        encoder_layers (:obj:`int`, `optional`, defaults to 12):
            Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 6):
+        decoder_layers (:obj:`int`, `optional`, defaults to 12):
            Number of decoder layers.
-        encoder_attention_heads (:obj:`int`, `optional`, defaults to 8):
+        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (:obj:`int`, `optional`, defaults to 8):
+        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
-            Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
-        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 2048):
-            Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
+        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
@@ -59,42 +67,113 @@ class MarianConfig(BartConfig):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
            The dropout ratio for classifier.
-        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            This should be completed, specific to marian.
-        normalize_before (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Call layernorm before attention ops.
-        normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Call layernorm after embeddings.
-        static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Don't learn positional embeddings, use sinusoidal.
-        add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Why not add another layernorm?
-        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Scale embeddings by diving by sqrt(d_model).
-        eos_token_id (:obj:`int`, `optional`, defaults to 2)
-            End of stream token id.
-        pad_token_id (:obj:`int`, `optional`, defaults to 1)
-            Padding token id.
-        bos_token_id (:obj:`int`, `optional`, defaults to 0)
-            Beginning of stream token id.
        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
            https://arxiv.org/abs/1909.11556>`__ for more details.
        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
            https://arxiv.org/abs/1909.11556>`__ for more details.
-        extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
-            How many extra learned positional embeddings to use.
-        is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether this is an encoder/decoder model
-        force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``).
-    """
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Scale embeddings by a factor of sqrt(d_model).
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models)

+    Examples::
+
+        >>> from transformers import MarianModel, MarianConfig
+
+        >>> # Initializing a Marian Helsinki-NLP/opus-mt-en-de style configuration
+        >>> configuration = MarianConfig()
+
+        >>> # Initializing a model from the Helsinki-NLP/opus-mt-en-de style configuration
+        >>> model = MarianModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
+    """
    model_type = "marian"
    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=50265,
+        max_position_embeddings=1024,
+        encoder_layers=12,
+        encoder_ffn_dim=4096,
+        encoder_attention_heads=16,
+        decoder_layers=12,
+        decoder_ffn_dim=4096,
+        decoder_attention_heads=16,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        use_cache=True,
+        is_encoder_decoder=True,
+        activation_function="gelu",
+        d_model=1024,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        decoder_start_token_id=58100,
+        classifier_dropout=0.0,
+        scale_embedding=False,
+        gradient_checkpointing=False,
+        pad_token_id=58100,
+        eos_token_id=0,
+        **kwargs
+    ):
+        # NOTE(review): the default `pad_token_id` and `decoder_start_token_id` (58100) are larger than the default
+        # `vocab_size` (50265) -- confirm that a configuration built purely from the defaults is usable.
+        # The special token ids, the encoder-decoder flag and any extra kwargs are forwarded to the parent constructor.
+        super().__init__(
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
+        )
+
+        # Architecture and regularization hyper-parameters are stored verbatim as attributes.
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.classifier_dropout = classifier_dropout
+        self.use_cache = use_cache
+        # `num_hidden_layers` mirrors the encoder depth.
+        self.num_hidden_layers = encoder_layers
+        self.gradient_checkpointing = gradient_checkpointing
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
+        # IMPORTANT
+        # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY
+        # These attributes are hard-coded here and cannot be set through the constructor arguments above.
+        self.extra_pos_embeddings = 0
+        self.normalize_before = False
+        self.add_final_layer_norm = False
+        self.do_blenderbot_90_layernorm = False
+        self.normalize_embedding = False
+        self.static_position_embeddings = True
+        self.add_bias_logits = False
+        self.force_bos_token_to_be_generated = False
+
+    @property
+    def num_attention_heads(self) -> int:
+        """Alias returning :obj:`encoder_attention_heads` under the generic ``num_attention_heads`` name."""
+        return self.encoder_attention_heads
+
+    @property
+    def hidden_size(self) -> int:
+        """Alias returning :obj:`d_model` under the generic ``hidden_size`` name."""
+        return self.d_model
--- a/src/transformers/models/marian/modeling_marian.py
+++ b/src/transformers/models/marian/modeling_marian.py
--- a/src/transformers/models/marian/tokenization_marian.py
+++ b/src/transformers/models/marian/tokenization_marian.py
@@ -84,7 +84,7 @@ class MarianTokenizer(PreTrainedTokenizer):
        >>> tok = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-de')
        >>> src_texts = [ "I am a small frog.", "Tom asked his teacher for advice."]
        >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
-        >>> batch_enc: BatchEncoding = tok.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts, return_tensors="pt")
+        >>> batch_enc = tok.prepare_seq2seq_batch(src_texts, tgt_texts=tgt_texts, return_tensors="pt")
        >>> # keys  [input_ids, attention_mask, labels].
        >>> # model(**batch) should work
    """
--- a/src/transformers/models/mbart/init.py
+++ b/src/transformers/models/mbart/init.py
@@ -15,9 +15,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available
-from .configuration_mbart import MBartConfig
+from .configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig


 if is_sentencepiece_available():
@@ -27,7 +26,14 @@ if is_tokenizers_available():
    from .tokenization_mbart_fast import MBartTokenizerFast

 if is_torch_available():
-    from .modeling_mbart import MBartForConditionalGeneration, MBartModel
+    from .modeling_mbart import (
+        MBART_PRETRAINED_MODEL_ARCHIVE_LIST,
+        MBartForConditionalGeneration,
+        MBartForQuestionAnswering,
+        MBartForSequenceClassification,
+        MBartModel,
+        MBartPreTrainedModel,
+    )

 if is_tf_available():
    from .modeling_tf_mbart import TFMBartForConditionalGeneration
--- a/src/transformers/models/mbart/configuration_mbart.py
+++ b/src/transformers/models/mbart/configuration_mbart.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2020 The Fairseq Authors and The HuggingFace Inc. team.
+# Copyright 2021, The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,33 +12,36 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" MBART configuration """
+""" MBART model configuration """

+from ...configuration_utils import PretrainedConfig
 from ...utils import logging
-from ..bart.configuration_bart import BartConfig


 logger = logging.get_logger(__name__)

 MBART_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "facebook/mbart-large-en-ro": "https://huggingface.co/facebook/mbart-large-en-ro/resolve/main/config.json",
    "facebook/mbart-large-cc25": "https://huggingface.co/facebook/mbart-large-cc25/resolve/main/config.json",
+    # See all MBART models at https://huggingface.co/models?filter=mbart
 }


-class MBartConfig(BartConfig):
-    """
-    This is the configuration class to store the configuration of a
-    :class:`~transformers.MBartForConditionalGeneration`. It is used to instantiate a BART model according to the
-    specified arguments, defining the model architecture.
+class MBartConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.MBartModel`. It is used to
+    instantiate an MBART model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the MBART `facebook/mbart-large-cc25
+    <https://huggingface.co/facebook/mbart-large-cc25>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

+
    Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 250027):
+        vocab_size (:obj:`int`, `optional`, defaults to 50265):
            Vocabulary size of the MBART model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.MBartForConditionalGeneration`.
+            :obj:`inputs_ids` passed when calling :class:`~transformers.MBartModel` or
+            :class:`~transformers.TFMBartModel`.
        d_model (:obj:`int`, `optional`, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (:obj:`int`, `optional`, defaults to 12):
@@ -50,9 +53,9 @@ class MBartConfig(BartConfig):
        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
-            Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
-            Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
+            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
@@ -69,37 +72,108 @@ class MBartConfig(BartConfig):
            just in case (e.g., 512 or 1024 or 2048).
        init_std (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            This should be completed, specific to marian.
-        normalize_before (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Call layernorm before attention ops.
-        normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Call layernorm after embeddings. Only True for Bart.
-        static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Don't learn positional embeddings, use sinusoidal.
-        add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Why not add another layernorm?
-        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Scale embeddings by diving by sqrt(d_model).
-        eos_token_id (:obj:`int`, `optional`, defaults to 2)
-            End of stream token id.
-        pad_token_id (:obj:`int`, `optional`, defaults to 1)
-            Padding token id.
-        bos_token_id (:obj:`int`, `optional`, defaults to 0)
-            Beginning of stream token id.
        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
            https://arxiv.org/abs/1909.11556>`__ for more details.
        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
            https://arxiv.org/abs/1909.11556>`__ for more details.
-        extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
-            How many extra learned positional embeddings to use. Should be equal to :obj:`pad_token_id+1`.
-        is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether this is an encoder/decoder model
-        force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``).
-    """
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Scale embeddings by multiplying by sqrt(d_model).
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models)

+    Example::
+
+        >>> from transformers import MBartModel, MBartConfig
+
+        >>> # Initializing a MBART facebook/mbart-large-cc25 style configuration
+        >>> configuration = MBartConfig()
+
+        >>> # Initializing a model from the facebook/mbart-large-cc25 style configuration
+        >>> model = MBartModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
+    """
    model_type = "mbart"
    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=50265,
+        max_position_embeddings=1024,
+        encoder_layers=12,
+        encoder_ffn_dim=4096,
+        encoder_attention_heads=16,
+        decoder_layers=12,
+        decoder_ffn_dim=4096,
+        decoder_attention_heads=16,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        use_cache=True,
+        is_encoder_decoder=True,
+        activation_function="gelu",
+        d_model=1024,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        classifier_dropout=0.0,
+        scale_embedding=False,
+        gradient_checkpointing=False,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        **kwargs
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            **kwargs,
+        )
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.classifier_dropout = classifier_dropout
+        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers
+        self.gradient_checkpointing = gradient_checkpointing
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
+        # IMPORTANT
+        # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY
+        self.extra_pos_embeddings = 2
+        self.normalize_before = True
+        self.add_final_layer_norm = True
+        self.do_blenderbot_90_layernorm = False
+        self.normalize_embedding = True
+        self.static_position_embeddings = False
+        self.add_bias_logits = False
+        self.force_bos_token_to_be_generated = False
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self.encoder_attention_heads
+
+    @property
+    def hidden_size(self) -> int:
+        return self.d_model
--- a/src/transformers/models/mbart/modeling_mbart.py
+++ b/src/transformers/models/mbart/modeling_mbart.py
--- a/src/transformers/models/pegasus/init.py
+++ b/src/transformers/models/pegasus/init.py
@@ -15,9 +15,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from ...file_utils import is_sentencepiece_available, is_tf_available, is_tokenizers_available, is_torch_available
-from .configuration_pegasus import PegasusConfig
+from .configuration_pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig


 if is_sentencepiece_available():
@@ -27,7 +26,12 @@ if is_tokenizers_available():
    from .tokenization_pegasus_fast import PegasusTokenizerFast

 if is_torch_available():
-    from .modeling_pegasus import PegasusForConditionalGeneration, PegasusModel
+    from .modeling_pegasus import (
+        PEGASUS_PRETRAINED_MODEL_ARCHIVE_LIST,
+        PegasusForConditionalGeneration,
+        PegasusModel,
+        PegasusPreTrainedModel,
+    )

 if is_tf_available():
    from .modeling_tf_pegasus import TFPegasusForConditionalGeneration
--- a/src/transformers/models/pegasus/configuration_pegasus.py
+++ b/src/transformers/models/pegasus/configuration_pegasus.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2020 Google and The HuggingFace Inc. team.
+# Copyright 2021, Google and The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,84 +14,48 @@
 # limitations under the License.
 """ PEGASUS model configuration """

+from ...configuration_utils import PretrainedConfig
 from ...utils import logging
-from ..bart.configuration_bart import BartConfig


 logger = logging.get_logger(__name__)

-# These config values do not vary between checkpoints
-DEFAULTS = dict(
-    vocab_size=96103,
-    max_position_embeddings=512,
-    d_model=1024,
-    encoder_ffn_dim=4096,
-    decoder_ffn_dim=4096,
-    encoder_attention_heads=16,
-    decoder_attention_heads=16,
-    encoder_layers=16,
-    decoder_layers=16,
-    dropout=0.1,
-    attention_dropout=0.1,
-    activation_dropout=0.1,
-    pad_token_id=0,
-    eos_token_id=1,
-    is_encoder_decoder=True,
-    normalize_before=True,
-    scale_embedding=True,
-    normalize_embedding=False,
-    add_final_layer_norm=True,
-    static_position_embeddings=True,
-    num_beams=8,
-    activation_function="relu",
-)
-# Config values that vary between checkpoints: for testing and conversion
-task_specific_params = {
-    # These are task specific params for pegasus-large and normal params for finetuned checkpoints
-    "summarization_xsum": {"length_penalty": 0.6, "max_length": 64, "max_position_embeddings": 512},
-    "summarization_cnn_dailymail": {"length_penalty": 0.8, "max_length": 128, "max_position_embeddings": 1024},
-    "summarization_newsroom": {"length_penalty": 0.8, "max_length": 128, "max_position_embeddings": 512},
-    "summarization_wikihow": {"length_penalty": 0.6, "max_length": 256, "max_position_embeddings": 512},
-    "summarization_multi_news": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024},
-    "summarization_reddit_tifu": {"length_penalty": 0.6, "max_length": 128, "max_position_embeddings": 512},
-    "summarization_big_patent": {"length_penalty": 0.7, "max_length": 256, "max_position_embeddings": 1024},
-    "summarization_arxiv": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024},
-    "summarization_pubmed": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024},
-    "summarization_gigaword": {"length_penalty": 0.6, "max_length": 32, "max_position_embeddings": 128},
-    "summarization_aeslc": {"length_penalty": 0.6, "max_length": 32, "max_position_embeddings": 512},
-    "summarization_billsum": {"length_penalty": 0.6, "max_length": 256, "max_position_embeddings": 1024},
-    # this last entry is useless -- just for consistency
-    "summarization_large": {"length_penalty": 0.8, "max_length": 256, "max_position_embeddings": 1024},
+PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "google/pegasus-large": "https://huggingface.co/google/pegasus-large/resolve/main/config.json",
+    # See all PEGASUS models at https://huggingface.co/models?filter=pegasus
 }


-class PegasusConfig(BartConfig):
-    """
-    This is the configuration class to store the configuration of a
-    :class:`~transformers.PegasusForConditionalGeneration`. It is used to instantiate a Pegasus model according to the
-    specified arguments, defining the model architecture.
+class PegasusConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.PegasusModel`. It is used to
+    instantiate a PEGASUS model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the PEGASUS `google/pegasus-large
+    <https://huggingface.co/google/pegasus-large>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

+
    Args:
-        vocab_size (:obj:`int`, `optional`, defaults to 96103):
-            Vocabulary size of the Pegasus model. Defines the number of different tokens that can be represented by the
-            :obj:`inputs_ids` passed when calling :class:`~transformers.PegasusForConditionalGeneration`.
+        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+            Vocabulary size of the PEGASUS model. Defines the number of different tokens that can be represented by the
+            :obj:`inputs_ids` passed when calling :class:`~transformers.PegasusModel` or
+            :class:`~transformers.TFPegasusModel`.
        d_model (:obj:`int`, `optional`, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
-        encoder_layers (:obj:`int`, `optional`, defaults to 16):
+        encoder_layers (:obj:`int`, `optional`, defaults to 12):
            Number of encoder layers.
-        decoder_layers (:obj:`int`, `optional`, defaults to 16):
+        decoder_layers (:obj:`int`, `optional`, defaults to 12):
            Number of decoder layers.
        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
-            Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
-            Dimensionality of the "intermediate" (i.e., feed-forward) layer in decoder.
+            Dimensionality of the "intermediate" (often named feed-forward) layer in encoder.
        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
@@ -108,38 +72,108 @@ class PegasusConfig(BartConfig):
            just in case (e.g., 512 or 1024 or 2048).
        init_std (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        add_bias_logits (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            This should be completed, specific to marian.
-        normalize_before (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Call layernorm before attention ops.
-        normalize_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Call layernorm after embeddings.
-        static_position_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Don't learn positional embeddings, use sinusoidal.
-        add_final_layer_norm (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Why not add another layernorm?
-        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Scale embeddings by diving by sqrt(d_model).
-        eos_token_id (:obj:`int`, `optional`, defaults to 2)
-            End of stream token id.
-        pad_token_id (:obj:`int`, `optional`, defaults to 1)
-            Padding token id.
-        bos_token_id (:obj:`int`, `optional`, defaults to 0)
-            Beginning of stream token id.
        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
            https://arxiv.org/abs/1909.11556>`__ for more details.
        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
            https://arxiv.org/abs/1909.11556>`__ for more details.
-        extra_pos_embeddings: (:obj:`int`, `optional`, defaults to 2):
-            How many extra learned positional embeddings to use. Should be pad_token_id+1 for bart.
-        is_encoder_decoder (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether this is an encoder/decoder model
-        force_bos_token_to_be_generated (:obj:`bool`, `optional`, defaults to :obj:`False`):
-            Whether or not to force BOS token to be generated at step 1 (after ``decoder_start_token_id``).
-    """
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+        scale_embedding (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Scale embeddings by multiplying by sqrt(d_model).
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models)

+    Example::
+
+        >>> from transformers import PegasusModel, PegasusConfig
+
+        >>> # Initializing a PEGASUS google/pegasus-large style configuration
+        >>> configuration = PegasusConfig()
+
+        >>> # Initializing a model from the google/pegasus-large style configuration
+        >>> model = PegasusModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
+    """
    model_type = "pegasus"
    keys_to_ignore_at_inference = ["past_key_values"]
-    # The implementation of the config object is in BartConfig
+
+    def __init__(
+        self,
+        vocab_size=50265,
+        max_position_embeddings=1024,
+        encoder_layers=12,
+        encoder_ffn_dim=4096,
+        encoder_attention_heads=16,
+        decoder_layers=12,
+        decoder_ffn_dim=4096,
+        decoder_attention_heads=16,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        use_cache=True,
+        is_encoder_decoder=True,
+        activation_function="gelu",
+        d_model=1024,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        decoder_start_token_id=0,
+        classifier_dropout=0.0,
+        scale_embedding=False,
+        gradient_checkpointing=False,
+        pad_token_id=0,
+        eos_token_id=1,
+        **kwargs
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
+        )
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.classifier_dropout = classifier_dropout
+        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers
+        self.gradient_checkpointing = gradient_checkpointing
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
+        # IMPORTANT
+        # DELETE ALL OF THE FOLLOWING LINES AS SOON AS TF IS READY
+        self.extra_pos_embeddings = 0
+        self.normalize_before = True
+        self.add_final_layer_norm = True
+        self.do_blenderbot_90_layernorm = False
+        self.normalize_embedding = False
+        self.static_position_embeddings = True
+        self.add_bias_logits = False
+        self.force_bos_token_to_be_generated = False
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self.encoder_attention_heads
+
+    @property
+    def hidden_size(self) -> int:
+        return self.d_model
--- a/src/transformers/models/pegasus/modeling_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_pegasus.py
--- a/src/transformers/models/prophetnet/modeling_prophetnet.py
+++ b/src/transformers/models/prophetnet/modeling_prophetnet.py
@@ -88,9 +88,19 @@ PROPHETNET_INPUTS_DOCSTRING = r"""

            `What are attention masks? <../glossary.html#attention-mask>`__
        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
-            Provide for translation and summarization training. By default, the model will create this tensor by
-            shifting the :obj:`input_ids` to the right, following the paper.
-        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using :class:`~transformers.PreTrainedTokenizer`. See
+            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+
+            ProphetNet uses the :obj:`eos_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
+            :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
+            :obj:`past_key_values`).
+
+        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
            also be used by default.

--- a/src/transformers/models/t5/modeling_t5.py
+++ b/src/transformers/models/t5/modeling_t5.py
@@ -1028,14 +1028,22 @@ T5_INPUTS_DOCSTRING = r"""

            `What are attention masks? <../glossary.html#attention-mask>`__
        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
-            Provide for sequence to sequence training. T5 uses the :obj:`pad_token_id` as the starting token for
-            :obj:`decoder_input_ids` generation. If :obj:`past_key_values` is used, optionally only the last
-            :obj:`decoder_input_ids` have to be input (see :obj:`past_key_values`).
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using :class:`~transformers.T5Tokenizer`. See
+            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
+            details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+
+            T5 uses the :obj:`pad_token_id` as the starting token for :obj:`decoder_input_ids` generation. If
+            :obj:`past_key_values` is used, optionally only the last :obj:`decoder_input_ids` have to be input (see
+            :obj:`past_key_values`).

            To know more on how to prepare :obj:`decoder_input_ids` for pretraining take a look at `T5 Training
            <./t5.html#training>`__. If :obj:`decoder_input_ids` and :obj:`decoder_inputs_embeds` are both unset,
            :obj:`decoder_input_ids` takes the value of :obj:`input_ids`.
-        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
+        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
            also be used by default.
        encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor)`, `optional`):
--- a/src/transformers/models/t5/modeling_tf_t5.py
+++ b/src/transformers/models/t5/modeling_tf_t5.py
@@ -922,7 +922,7 @@ T5_INPUTS_DOCSTRING = r"""
            - 0 for tokens that are **masked**.

            `What are attention masks? <../glossary.html#attention-mask>`__
-        decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
+        decoder_attention_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
            also be used by default.
        encoder_outputs (:obj:`tuple(tuple(tf.FloatTensor)`, `optional`):
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -609,6 +609,27 @@ class BlenderbotModel:
        requires_pytorch(self)


+BLENDERBOT_SMALL_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class BlenderbotSmallForConditionalGeneration:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class BlenderbotSmallModel:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
 CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None


@@ -1327,6 +1348,15 @@ class LxmertXLayer:
        requires_pytorch(self)


+class MarianModel:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
 class MarianMTModel:
    def __init__(self, *args, **kwargs):
        requires_pytorch(self)
@@ -1345,6 +1375,24 @@ class MBartForConditionalGeneration:
        requires_pytorch(self)


+class MBartForQuestionAnswering:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
+class MBartForSequenceClassification:
+    def __init__(self, *args, **kwargs):
+        requires_pytorch(self)
+
+    @classmethod
+    def from_pretrained(self, *args, **kwargs):
+        requires_pytorch(self)
+
+
 class MBartModel:
    def __init__(self, *args, **kwargs):
        requires_pytorch(self)