Model Templates for Seq2Seq (#9251)

* adapt cookie cutter * fix copy past statement * delete copy statements for now * remove unused import from template * make doc rst * correct config docstring * correct training * correct inputs processing tf enc dec * make style * adapt templates * clean tabs * correct tensor -> Tensor naming * correct indent * correct templates * fix the test * break lines to avoid > 119 * Apply suggestions from code review
2020-12-22 23:41:20 +01:00
parent e6c1f1cad8
commit cbe63949d7
21 changed files with 3561 additions and 21 deletions
--- a/.github/workflows/model-templates.yml
+++ b/.github/workflows/model-templates.yml
@@ -40,6 +40,8 @@ jobs:
          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
          make style
          python utils/check_table.py --fix_and_overwrite
          python utils/check_dummies.py --fix_and_overwrite
--- a/src/transformers/models/bart/modeling_tf_bart.py
+++ b/src/transformers/models/bart/modeling_tf_bart.py
@@ -1077,7 +1077,7 @@ class TFBartModel(TFBartPretrainedModel):

        decoder_outputs = self.decoder(
            inputs["decoder_input_ids"],
-            attention_mask=decoder_attention_mask,
+            attention_mask=inputs["decoder_attention_mask"],
            encoder_hidden_states=inputs["encoder_outputs"][0],
            encoder_attention_mask=inputs["attention_mask"],
            past_key_values=inputs["past_key_values"],
@@ -1228,6 +1228,7 @@ class TFBartForConditionalGeneration(TFBartPretrainedModel):
            output_attentions=inputs["output_attentions"],
            output_hidden_states=inputs["output_hidden_states"],
            return_dict=inputs["return_dict"],
+            training=inputs["training"],
        )
        lm_logits = self.model.shared(outputs[0], mode="linear")
        lm_logits = lm_logits + self.final_logits_bias
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/init.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/init.py
@@ -30,6 +30,7 @@ if is_tokenizers_available():
    from .tokenization_{{cookiecutter.lowercase_modelname}}_fast import {{cookiecutter.camelcase_modelname}}TokenizerFast

 {%- if (cookiecutter.generate_tensorflow_and_pytorch == "PyTorch & TensorFlow" or cookiecutter.generate_tensorflow_and_pytorch == "PyTorch") %}
+{% if cookiecutter.is_encoder_decoder_model == "False" %}
 if is_torch_available():
    from .modeling_{{cookiecutter.lowercase_modelname}} import (
        {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -44,8 +45,20 @@ if is_torch_available():
        {{cookiecutter.camelcase_modelname}}PreTrainedModel,
        load_tf_weights_in_{{cookiecutter.lowercase_modelname}},
    )
+{% else %}
+if is_torch_available():
+    from .modeling_{{cookiecutter.lowercase_modelname}} import (
+        {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST,
+        {{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
+        {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
+        {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
+        {{cookiecutter.camelcase_modelname}}Model,
+        {{cookiecutter.camelcase_modelname}}PreTrainedModel,
+    )
+{% endif %}
 {% endif %}
 {%- if (cookiecutter.generate_tensorflow_and_pytorch == "PyTorch & TensorFlow" or cookiecutter.generate_tensorflow_and_pytorch == "TensorFlow") %}
+{% if cookiecutter.is_encoder_decoder_model == "False" %}
 if is_tf_available():
    from .modeling_tf_{{cookiecutter.lowercase_modelname}} import (
        TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -59,4 +72,12 @@ if is_tf_available():
        TF{{cookiecutter.camelcase_modelname}}Model,
        TF{{cookiecutter.camelcase_modelname}}PreTrainedModel,
    )
+{% else %}
+if is_tf_available():
+    from .modeling_tf_{{cookiecutter.lowercase_modelname}} import (
+        TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
+        TF{{cookiecutter.camelcase_modelname}}Model,
+        TF{{cookiecutter.camelcase_modelname}}PreTrainedModel,
+    )
+{% endif %}
 {% endif %}
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration.json
@@ -6,5 +6,6 @@
  "authors": "{{cookiecutter.authors}}",
  "checkpoint_identifier": "{{cookiecutter.checkpoint_identifier}}",
  "tokenizer_type": "{{cookiecutter.tokenizer_type}}",
-  "generate_tensorflow_and_pytorch": "{{cookiecutter.generate_tensorflow_and_pytorch}}"
+  "generate_tensorflow_and_pytorch": "{{cookiecutter.generate_tensorflow_and_pytorch}}",
+  "is_encoder_decoder_model": ["True", "False"]
 }
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/configuration_{{cookiecutter.lowercase_modelname}}.py
@@ -39,6 +39,7 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):


    Args:
+        {% if cookiecutter.is_encoder_decoder_model == "False" -%}
        vocab_size (:obj:`int`, `optional`, defaults to 30522):
            Vocabulary size of the {{cookiecutter.modelname}} model. Defines the number of different tokens that can be represented by the
            :obj:`inputs_ids` passed when calling :class:`~transformers.{{cookiecutter.camelcase_modelname}}Model` or
@@ -70,6 +71,50 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
+        {% else -%}
+        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+            Vocabulary size of the {{cookiecutter.modelname}} model. Defines the number of different tokens that can be represented by the
+            :obj:`inputs_ids` passed when calling :class:`~transformers.{{cookiecutter.camelcase_modelname}}Model` or
+            :class:`~transformers.TF{{cookiecutter.camelcase_modelname}}Model`.
+        d_model (:obj:`int`, `optional`, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        encoder_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of encoder layers.
+        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of decoder layers.
+        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
+        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for classifier.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        init_std (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
+            https://arxiv.org/abs/1909.11556>`__ for more details.
+        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
+            https://arxiv.org/abs/1909.11556>`__ for more details.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        {% endif -%}

    Example::

@@ -88,9 +133,9 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):

    def __init__(
        self,
+        {% if cookiecutter.is_encoder_decoder_model == "False" -%}
        vocab_size=30522,
        hidden_size=768,
-        is_encoder_decoder=False,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
@@ -101,6 +146,29 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
+        is_encoder_decoder=False,
+        {% else -%}
+        vocab_size=50265,
+        max_position_embeddings=1024,
+        encoder_layers=12,
+        encoder_ffn_dim=4096,
+        encoder_attention_heads=16,
+        decoder_layers=12,
+        decoder_ffn_dim=4096,
+        decoder_attention_heads=16,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        use_cache=True,
+        is_encoder_decoder=True,
+        activation_function="gelu",
+        d_model=1024,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        decoder_start_token_id=2,
+        classifier_dropout=0.0,
+        {% endif -%}
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
@@ -108,13 +176,19 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
    ):
        super().__init__(
            pad_token_id=pad_token_id,
-            is_encoder_decoder=is_encoder_decoder,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
+            {% if cookiecutter.is_encoder_decoder_model == "False" -%}
+            {% else -%}
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            {% endif -%}
            **kwargs
        )

        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        {% if cookiecutter.is_encoder_decoder_model == "False" -%}
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
@@ -122,8 +196,36 @@ class {{cookiecutter.camelcase_modelname}}Config(PretrainedConfig):
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
-        self.max_position_embeddings = max_position_embeddings
-        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
+        self.type_vocab_size = type_vocab_size
        self.layer_norm_eps = layer_norm_eps
+        {% else -%}
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.classifier_dropout = classifier_dropout
+        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers
+        {% endif -%}

+    {% if cookiecutter.is_encoder_decoder_model == "False" %}
+    {%- else %}
+    @property
+    def num_attention_heads(self) -> int:
+        return self.encoder_attention_heads
+
+    @property
+    def hidden_size(self) -> int:
+        return self.d_model
+    {%- endif %}
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_tf_{{cookiecutter.lowercase_modelname}}.py
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_tf_{{cookiecutter.lowercase_modelname}}.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+{% if cookiecutter.is_encoder_decoder_model == "False" %}

 import unittest

@@ -318,3 +319,272 @@ class TF{{cookiecutter.camelcase_modelname}}ModelIntegrationTest(unittest.TestCa
            ]
        )
        tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=1e-4)
+
+{% else %}
+import unittest
+
+from transformers import {{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}Tokenizer, is_tf_available
+from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_tf, slow
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+    from transformers import TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration, TF{{cookiecutter.camelcase_modelname}}Model
+
+
+@require_tf
+class TF{{cookiecutter.camelcase_modelname}}ModelTester:
+    config_cls = {{cookiecutter.camelcase_modelname}}Config
+    config_updates = {}
+    hidden_act = "gelu"
+
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_labels=False,
+        vocab_size=99,
+        hidden_size=32,
+        num_hidden_layers=5,
+        num_attention_heads=4,
+        intermediate_size=37,
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=20,
+        eos_token_id=2,
+        pad_token_id=1,
+        bos_token_id=0,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+
+    def prepare_config_and_inputs_for_common(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length - 1], self.vocab_size)
+        eos_tensor = tf.expand_dims(tf.constant([self.eos_token_id] * self.batch_size), 1)
+        input_ids = tf.concat([input_ids, eos_tensor], axis=1)
+
+        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        config = self.config_cls(
+            vocab_size=self.vocab_size,
+            d_model=self.hidden_size,
+            encoder_layers=self.num_hidden_layers,
+            decoder_layers=self.num_hidden_layers,
+            encoder_attention_heads=self.num_attention_heads,
+            decoder_attention_heads=self.num_attention_heads,
+            encoder_ffn_dim=self.intermediate_size,
+            decoder_ffn_dim=self.intermediate_size,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            eos_token_ids=[2],
+            bos_token_id=self.bos_token_id,
+            pad_token_id=self.pad_token_id,
+            decoder_start_token_id=self.pad_token_id,
+            **self.config_updates,
+        )
+        inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(config, input_ids, decoder_input_ids)
+        return config, inputs_dict
+
+    def check_decoder_model_past_large_inputs(self, config, inputs_dict):
+        model = TF{{cookiecutter.camelcase_modelname}}Model(config=config).get_decoder()
+        input_ids = inputs_dict["input_ids"]
+
+        input_ids = input_ids[:1, :]
+        attention_mask = inputs_dict["attention_mask"][:1, :]
+        self.batch_size = 1
+
+        # first forward pass
+        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
+
+        output, past_key_values = outputs.to_tuple()
+        past_key_values = past_key_values[1]
+
+        # create hypothetical next token and extent to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+        next_attn_mask = tf.cast(ids_tensor((self.batch_size, 3), 2), tf.int8)
+
+        # append to next input_ids and
+        next_input_ids = tf.concat([input_ids, next_tokens], axis=-1)
+        next_attention_mask = tf.concat([attention_mask, next_attn_mask], axis=-1)
+
+        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)[0]
+        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)[0]
+
+        self.parent.assertEqual(next_tokens.shape[1], output_from_past.shape[1])
+
+        # select random slice
+        random_slice_idx = int(ids_tensor((1,), output_from_past.shape[-1]))
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx]
+        output_from_past_slice = output_from_past[:, :, random_slice_idx]
+
+        # test that outputs are equal for slice
+        tf.debugging.assert_near(output_from_past_slice, output_from_no_past_slice, rtol=1e-3)
+
+
+def prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(
+    config,
+    input_ids,
+    decoder_input_ids,
+    attention_mask=None,
+    decoder_attention_mask=None,
+):
+    if attention_mask is None:
+        attention_mask = tf.cast(tf.math.not_equal(input_ids, config.pad_token_id), tf.int8)
+    if decoder_attention_mask is None:
+        decoder_attention_mask = tf.cast(tf.math.not_equal(decoder_input_ids, config.pad_token_id), tf.int8)
+    return {
+        "input_ids": input_ids,
+        "decoder_input_ids": decoder_input_ids,
+        "attention_mask": attention_mask,
+        "decoder_attention_mask": decoder_attention_mask,
+    }
+
+
+@require_tf
+class TF{{cookiecutter.camelcase_modelname}}ModelTest(TFModelTesterMixin, unittest.TestCase):
+    all_model_classes = (TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration, TF{{cookiecutter.camelcase_modelname}}Model) if is_tf_available() else ()
+    all_generative_model_classes = (TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,) if is_tf_available() else ()
+    is_encoder_decoder = True
+    test_pruning = False
+
+    def setUp(self):
+        self.model_tester = TF{{cookiecutter.camelcase_modelname}}ModelTester(self)
+        self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_decoder_model_past_large_inputs(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
+        self.model_tester.check_decoder_model_past_large_inputs(*config_and_inputs)
+
+    def test_model_common_attributes(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
+            x = model.get_output_layer_with_bias()
+            assert x is None
+            name = model.get_prefix_bias_name()
+            assert name is None
+
+
+def _assert_tensors_equal(a, b, atol=1e-12, prefix=""):
+    """If tensors not close, or a and b arent both tensors, raise a nice Assertion error."""
+    if a is None and b is None:
+        return True
+    try:
+        if tf.debugging.assert_near(a, b, atol=atol):
+            return True
+        raise
+    except Exception:
+        msg = "{} != {}".format(a, b)
+        if prefix:
+            msg = prefix + ": " + msg
+        raise AssertionError(msg)
+
+
+def _long_tensor(tok_lst):
+    return tf.constant(tok_lst, dtype=tf.int32)
+
+
+TOLERANCE = 1e-4
+
+
+@slow
+@require_sentencepiece
+@require_tokenizers
+@require_tf
+class TF{{cookiecutter.camelcase_modelname}}ModelIntegrationTest(unittest.TestCase):
+    def test_inference_no_head(self):
+        model = TF{{cookiecutter.camelcase_modelname}}Model.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
+        # change to intended input here
+        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+        decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+        inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids)
+        output = model(**inputs_dict)[0]
+        expected_shape = (1, 11, 1024)
+        self.assertEqual(output.shape, expected_shape)
+        # change to expected output here
+        expected_slice = tf.Tensor(
+            [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]],
+        )
+        self.assertTrue(tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE))
+
+    def test_inference_with_head(self):
+        model = TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
+        # change to intended input here
+        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+        decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+        inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids)
+        output = model(**inputs_dict)[0]
+        expected_shape = (1, 11, 1024)
+        self.assertEqual(output.shape, expected_shape)
+        # change to expected output here
+        expected_slice = tf.Tensor(
+            [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]],
+        )
+        self.assertTrue(tf.debugging.assert_near(output[:, :3, :3], expected_slice, atol=TOLERANCE))
+
+    def test_seq_to_seq_generation(self):
+        hf = TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
+        tok = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
+
+        batch_input = [
+            # string 1,
+            # string 2,
+            # string 3,
+            # string 4,
+        ]
+
+        # The below article tests that we don't add any hypotheses outside of the top n_beams
+        dct = tok.batch_encode_plus(
+            batch_input,
+            max_length=512,
+            padding="max_length",
+            truncation_strategy="only_first",
+            truncation=True,
+            return_tensors="tf",
+        )
+
+        hypotheses_batch = hf.generate(
+            input_ids=dct["input_ids"],
+            attention_mask=dct["attention_mask"],
+            num_beams=2,
+        )
+
+        EXPECTED = [
+            # here expected 1,
+            # here expected 2,
+            # here expected 3,
+            # here expected 4,
+        ]
+
+        generated = tok.batch_decode(
+            hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True
+        )
+        assert generated == EXPECTED
+{%- endif %}
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/test_modeling_{{cookiecutter.lowercase_modelname}}.py
@@ -15,6 +15,7 @@
 """ Testing suite for the PyTorch {{cookiecutter.modelname}} model. """


+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
 import unittest

 from tests.test_modeling_common import floats_tensor
@@ -406,3 +407,395 @@ class {{cookiecutter.camelcase_modelname}}ModelIntegrationTest(unittest.TestCase
        )

        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
+
+
+{% else -%}
+import copy
+import tempfile
+import unittest
+
+import timeout_decorator  # noqa
+
+from transformers import is_torch_available
+from transformers.file_utils import cached_property
+from transformers.testing_utils import require_sentencepiece, require_tokenizers, require_torch, slow, torch_device
+
+from .test_configuration_common import ConfigTester
+from .test_generation_utils import GenerationTesterMixin
+from .test_modeling_common import ModelTesterMixin, ids_tensor
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        {{cookiecutter.camelcase_modelname}}Config,
+        {{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
+        {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
+        {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
+        {{cookiecutter.camelcase_modelname}}Model,
+        {{cookiecutter.camelcase_modelname}}Tokenizer,
+    )
+    from transformers.models.{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_modelname}} import (
+        {{cookiecutter.camelcase_modelname}}Decoder,
+        {{cookiecutter.camelcase_modelname}}Encoder,
+    )
+
+
+def prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(
+    config,
+    input_ids,
+    decoder_input_ids,
+    attention_mask=None,
+    decoder_attention_mask=None,
+):
+    if attention_mask is None:
+        attention_mask = input_ids.ne(config.pad_token_id)
+    if decoder_attention_mask is None:
+        decoder_attention_mask = decoder_input_ids.ne(config.pad_token_id)
+    return {
+        "input_ids": input_ids,
+        "decoder_input_ids": decoder_input_ids,
+        "attention_mask": attention_mask,
+        "decoder_attention_mask": attention_mask,
+    }
+
+
+@require_torch
+class {{cookiecutter.camelcase_modelname}}ModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_labels=False,
+        vocab_size=99,
+        hidden_size=16,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=4,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=20,
+        eos_token_id=2,
+        pad_token_id=1,
+        bos_token_id=0,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.eos_token_id = eos_token_id
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
+            3,
+        )
+        input_ids[:, -1] = self.eos_token_id  # Eos Token
+
+        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        config = {{cookiecutter.camelcase_modelname}}Config(
+            vocab_size=self.vocab_size,
+            d_model=self.hidden_size,
+            encoder_layers=self.num_hidden_layers,
+            decoder_layers=self.num_hidden_layers,
+            encoder_attention_heads=self.num_attention_heads,
+            decoder_attention_heads=self.num_attention_heads,
+            encoder_ffn_dim=self.intermediate_size,
+            decoder_ffn_dim=self.intermediate_size,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            eos_token_id=self.eos_token_id,
+            bos_token_id=self.bos_token_id,
+            pad_token_id=self.pad_token_id,
+        )
+        inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(config, input_ids, decoder_input_ids)
+        return config, inputs_dict
+
+    def prepare_config_and_inputs_for_common(self):
+        config, inputs_dict = self.prepare_config_and_inputs()
+        return config, inputs_dict
+
+    def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
+        model = {{cookiecutter.camelcase_modelname}}Model(config=config).get_decoder().to(torch_device).eval()
+        input_ids = inputs_dict["input_ids"]
+        attention_mask = inputs_dict["attention_mask"]
+
+        # first forward pass
+        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)
+
+        output, past_key_values = outputs.to_tuple()
+
+        # create hypothetical multiple next token and extent to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+        next_attn_mask = ids_tensor((self.batch_size, 3), 2)
+
+        # append to next input_ids and
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        next_attention_mask = torch.cat([attention_mask, next_attn_mask], dim=-1)
+
+        output_from_no_past = model(next_input_ids, attention_mask=next_attention_mask)["last_hidden_state"]
+        output_from_past = model(next_tokens, attention_mask=next_attention_mask, past_key_values=past_key_values)["last_hidden_state"]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
+
+        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-2))
+
+    def check_encoder_decoder_model_standalone(self, config, inputs_dict):
+        model = {{cookiecutter.camelcase_modelname}}Model(config=config).to(torch_device).eval()
+        outputs = model(**inputs_dict)
+
+        encoder_last_hidden_state = outputs.encoder_last_hidden_state
+        last_hidden_state = outputs.last_hidden_state
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            encoder = model.get_encoder()
+            encoder.save_pretrained(tmpdirname)
+            encoder = {{cookiecutter.camelcase_modelname}}Encoder.from_pretrained(tmpdirname).to(torch_device)
+
+        encoder_last_hidden_state_2 = encoder(inputs_dict["input_ids"], attention_mask=inputs_dict["attention_mask"])[
+            0
+        ]
+
+        self.parent.assertTrue((encoder_last_hidden_state_2 - encoder_last_hidden_state).abs().max().item() < 1e-3)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            decoder = model.get_decoder()
+            decoder.save_pretrained(tmpdirname)
+            decoder = {{cookiecutter.camelcase_modelname}}Decoder.from_pretrained(tmpdirname).to(torch_device)
+
+        last_hidden_state_2 = decoder(
+            input_ids=inputs_dict["decoder_input_ids"],
+            attention_mask=inputs_dict["decoder_attention_mask"],
+            encoder_hidden_states=encoder_last_hidden_state,
+            encoder_attention_mask=inputs_dict["attention_mask"],
+        )[0]
+
+        self.parent.assertTrue((last_hidden_state_2 - last_hidden_state).abs().max().item() < 1e-3)
+
+
+@require_torch
+class {{cookiecutter.camelcase_modelname}}ModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+    all_model_classes = (
+        ({{cookiecutter.camelcase_modelname}}Model, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration, {{cookiecutter.camelcase_modelname}}ForSequenceClassification, {{cookiecutter.camelcase_modelname}}ForQuestionAnswering)
+        if is_torch_available()
+        else ()
+    )
+    all_generative_model_classes = ({{cookiecutter.camelcase_modelname}}ForConditionalGeneration,) if is_torch_available() else ()
+    is_encoder_decoder = True
+    test_pruning = False
+    test_head_masking = False
+    test_missing_keys = False
+
+    def setUp(self):
+        self.model_tester = {{cookiecutter.camelcase_modelname}}ModelTester(self)
+        self.config_tester = ConfigTester(self, config_class={{cookiecutter.camelcase_modelname}}Config)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_initialization_more(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
+        model = {{cookiecutter.camelcase_modelname}}Model(config)
+        model.to(torch_device)
+        model.eval()
+        # test init
+        self.assertTrue((model.encoder.embed_tokens.weight == model.shared.weight).all().item())
+
+        def _check_var(module):
+            """Check that we initialized various parameters from N(0, config.init_std)."""
+            self.assertAlmostEqual(torch.std(module.weight).item(), config.init_std, 2)
+
+        _check_var(model.encoder.embed_tokens)
+        _check_var(model.encoder.layers[0].self_attn.k_proj)
+        _check_var(model.encoder.layers[0].fc1)
+        _check_var(model.encoder.embed_positions)
+
+    def test_save_load_strict(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs()
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
+            self.assertEqual(info["missing_keys"], [])
+
+    def test_decoder_model_past_with_large_inputs(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
+
+    def test_encoder_decoder_model_standalone(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common()
+        self.model_tester.check_encoder_decoder_model_standalone(*config_and_inputs)
+
+    # {{cookiecutter.camelcase_modelname}}ForSequenceClassification does not support inputs_embeds
+    def test_inputs_embeds(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in ({{cookiecutter.camelcase_modelname}}Model, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration, {{cookiecutter.camelcase_modelname}}ForQuestionAnswering):
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class))
+
+            if not self.is_encoder_decoder:
+                input_ids = inputs["input_ids"]
+                del inputs["input_ids"]
+            else:
+                encoder_input_ids = inputs["input_ids"]
+                decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids)
+                del inputs["input_ids"]
+                inputs.pop("decoder_input_ids", None)
+
+            wte = model.get_input_embeddings()
+            if not self.is_encoder_decoder:
+                inputs["inputs_embeds"] = wte(input_ids)
+            else:
+                inputs["inputs_embeds"] = wte(encoder_input_ids)
+                inputs["decoder_inputs_embeds"] = wte(decoder_input_ids)
+
+            with torch.no_grad():
+                model(**inputs)[0]
+
+    def test_generate_fp16(self):
+        config, input_dict = self.model_tester.prepare_config_and_inputs()
+        input_ids = input_dict["input_ids"]
+        attention_mask = input_ids.ne(1).to(torch_device)
+        model = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration(config).eval().to(torch_device)
+        if torch_device == "cuda":
+            model.half()
+        model.generate(input_ids, attention_mask=attention_mask)
+        model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
+
+
+def assert_tensors_close(a, b, atol=1e-12, prefix=""):
+    """If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error."""
+    if a is None and b is None:
+        return True
+    try:
+        if torch.allclose(a, b, atol=atol):
+            return True
+        raise
+    except Exception:
+        pct_different = (torch.gt((a - b).abs(), atol)).float().mean().item()
+        if a.numel() > 100:
+            msg = f"tensor values are {pct_different:.1%} percent different."
+        else:
+            msg = f"{a} != {b}"
+        if prefix:
+            msg = prefix + ": " + msg
+        raise AssertionError(msg)
+
+
+def _long_tensor(tok_lst):
+    return torch.tensor(tok_lst, dtype=torch.long, device=torch_device)
+
+
+TOLERANCE = 1e-4
+
+
+@require_torch
+@require_sentencepiece
+@require_tokenizers
+@slow
+class {{cookiecutter.camelcase_modelname}}ModelIntegrationTests(unittest.TestCase):
+    @cached_property
+    def default_tokenizer(self):
+        return {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
+
+    def test_inference_no_head(self):
+        model = {{cookiecutter.camelcase_modelname}}Model.from_pretrained('{{cookiecutter.checkpoint_identifier}}').to(torch_device)
+        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+        decoder_input_ids = _long_tensor([[2, 0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588]])
+        inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids)
+        with torch.no_grad():
+            output = model(**inputs_dict)[0]
+        expected_shape = torch.Size((1, 11, 1024))
+        self.assertEqual(output.shape, expected_shape)
+        # change to expected output here
+        expected_slice = torch.tensor(
+            [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device
+        )
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))
+
+    def test_inference_head(self):
+        model = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}').to(torch_device)
+
+        # change to intended input
+        input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+        decoder_input_ids = _long_tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+        inputs_dict = prepare_{{cookiecutter.lowercase_modelname}}_inputs_dict(model.config, input_ids, decoder_input_ids)
+        with torch.no_grad():
+            output = model(**inputs_dict)[0]
+        expected_shape = torch.Size((1, 11, model.config.vocab_size))
+        self.assertEqual(output.shape, expected_shape)
+        # change to expected output here
+        expected_slice = torch.tensor(
+            [[0.7144, 0.8143, -1.2813], [0.7144, 0.8143, -1.2813], [-0.0467, 2.5911, -2.1845]], device=torch_device
+        )
+        self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=TOLERANCE))
+
+    def test_seq_to_seq_generation(self):
+        hf = {{cookiecutter.camelcase_modelname}}ForConditionalGeneration.from_pretrained('{{cookiecutter.checkpoint_identifier}}').to(torch_device)
+        tok = {{cookiecutter.camelcase_modelname}}Tokenizer.from_pretrained('{{cookiecutter.checkpoint_identifier}}')
+
+        batch_input = [
+            # string 1,
+            # string 2,
+            # string 3,
+            # string 4,
+        ]
+
+        # The below article tests that we don't add any hypotheses outside of the top n_beams
+        dct = tok.batch_encode_plus(
+            batch_input,
+            max_length=512,
+            padding="max_length",
+            truncation_strategy="only_first",
+            truncation=True,
+            return_tensors="pt",
+        )
+
+        hypotheses_batch = hf.generate(
+            input_ids=dct["input_ids"].to(torch_device),
+            attention_mask=dct["attention_mask"].to(torch_device),
+            num_beams=2,
+        )
+
+        EXPECTED = [
+            # here expected 1,
+            # here expected 2,
+            # here expected 3,
+            # here expected 4,
+        ]
+
+        generated = tok.batch_decode(
+            hypotheses_batch.tolist(), clean_up_tokenization_spaces=True, skip_special_tokens=True
+        )
+        assert generated == EXPECTED
+{% endif -%}
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/to_replace_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/to_replace_{{cookiecutter.lowercase_modelname}}.py
@@ -28,6 +28,7 @@
 # To replace in: "src/transformers/__init__.py"
 # Below: "if is_torch_available():" if generating PyTorch
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" %}
    from .models.{{cookiecutter.lowercase_modelname}} import (
        {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST,
        {{cookiecutter.camelcase_modelname}}ForMaskedLM,
@@ -41,10 +42,20 @@
        {{cookiecutter.camelcase_modelname}}PreTrainedModel,
        load_tf_weights_in_{{cookiecutter.lowercase_modelname}},
    )
+{% else %}
+    from .models.{{cookiecutter.lowercase_modelname}} import (
+        {{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST,
+        {{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
+        {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
+        {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
+        {{cookiecutter.camelcase_modelname}}Model,
+    )
+{% endif -%}
 # End.

 # Below: "if is_tf_available():" if generating TensorFlow
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" %}
    from .models.{{cookiecutter.lowercase_modelname}} import (
        TF_{{cookiecutter.uppercase_modelname}}_PRETRAINED_MODEL_ARCHIVE_LIST,
        TF{{cookiecutter.camelcase_modelname}}ForMaskedLM,
@@ -57,11 +68,19 @@
        TF{{cookiecutter.camelcase_modelname}}Model,
        TF{{cookiecutter.camelcase_modelname}}PreTrainedModel,
    )
+{% else %}
+    from .models.{{cookiecutter.lowercase_modelname}} import (
+        TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
+        TF{{cookiecutter.camelcase_modelname}}Model,
+        TF{{cookiecutter.camelcase_modelname}}PreTrainedModel,
+    )
+{% endif -%}
 # End.

 # Below: "if is_tokenizers_available():"
 # Replace with:
-    from models.{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}TokenizerFast
+    from .models.{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}TokenizerFast
+# End.

 # Below: "from .models.albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig"
 # Replace with:
@@ -101,6 +120,7 @@ from ..{{cookiecutter.lowercase_modelname}}.configuration_{{cookiecutter.lowerca

 # Below: "# Add modeling imports here"
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
 from ..{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_modelname}} import (
    {{cookiecutter.camelcase_modelname}}ForMaskedLM,
    {{cookiecutter.camelcase_modelname}}ForCausalLM,
@@ -110,6 +130,14 @@ from ..{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_mo
    {{cookiecutter.camelcase_modelname}}ForTokenClassification,
    {{cookiecutter.camelcase_modelname}}Model,
 )
+{% else -%}
+from ..{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_modelname}} import (
+    {{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
+    {{cookiecutter.camelcase_modelname}}ForQuestionAnswering,
+    {{cookiecutter.camelcase_modelname}}ForSequenceClassification,
+    {{cookiecutter.camelcase_modelname}}Model,
+)
+{% endif -%}
 # End.

 # Below: "# Base model mapping"
@@ -119,17 +147,27 @@ from ..{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_mo

 # Below: "# Model with LM heads mapping"
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
        ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForMaskedLM),
+{% else %}
+        ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration),
+{% endif -%}
 # End.

 # Below: "# Model for Causal LM mapping"
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
        ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForCausalLM),
+{% else -%}
+{% endif -%}
 # End.

 # Below: "# Model for Masked LM mapping"
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
        ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForMaskedLM),
+{% else -%}
+{% endif -%}
 # End.

 # Below: "# Model for Sequence Classification mapping"
@@ -144,14 +182,27 @@ from ..{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_mo

 # Below: "# Model for Token Classification mapping"
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
        ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForTokenClassification),
+{% else -%}
+{% endif -%}
 # End.

 # Below: "# Model for Multiple Choice mapping"
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
        ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForMultipleChoice),
+{% else -%}
+{% endif -%}
 # End.

+# Below: "# Model for Seq2Seq Causal LM mapping"
+# Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
+{% else %}
+        ({{cookiecutter.camelcase_modelname}}Config, {{cookiecutter.camelcase_modelname}}ForConditionalGeneration),
+{% endif -%}
+# End.

 # To replace in: "src/transformers/models/auto/modeling_tf_auto.py" if generating TensorFlow
 # Below: "from .configuration_auto import ("
@@ -161,6 +212,7 @@ from ..{{cookiecutter.lowercase_modelname}}.modeling_{{cookiecutter.lowercase_mo

 # Below: "# Add modeling imports here"
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
 from ..{{cookiecutter.lowercase_modelname}}.modeling_tf_{{cookiecutter.lowercase_modelname}} import (
    TF{{cookiecutter.camelcase_modelname}}ForMaskedLM,
    TF{{cookiecutter.camelcase_modelname}}ForCausalLM,
@@ -170,6 +222,12 @@ from ..{{cookiecutter.lowercase_modelname}}.modeling_tf_{{cookiecutter.lowercase
    TF{{cookiecutter.camelcase_modelname}}ForTokenClassification,
    TF{{cookiecutter.camelcase_modelname}}Model,
 )
+{% else -%}
+from ..{{cookiecutter.lowercase_modelname}}.modeling_tf_{{cookiecutter.lowercase_modelname}} import (
+    TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration,
+    TF{{cookiecutter.camelcase_modelname}}Model,
+)
+{% endif -%}
 # End.

 # Below: "# Base model mapping"
@@ -179,35 +237,65 @@ from ..{{cookiecutter.lowercase_modelname}}.modeling_tf_{{cookiecutter.lowercase

 # Below: "# Model with LM heads mapping"
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
        ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForMaskedLM),
+{% else %}
+        ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration),
+{% endif -%}
 # End.

 # Below: "# Model for Causal LM mapping"
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
        ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForCausalLM),
+{% else -%}
+{% endif -%}
 # End.

 # Below: "# Model for Masked LM mapping"
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
        ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForMaskedLM),
+{% else -%}
+{% endif -%}
 # End.

 # Below: "# Model for Sequence Classification mapping"
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
        ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification),
+{% else -%}
+{% endif -%}
 # End.

 # Below: "# Model for Question Answering mapping"
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
        ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering),
+{% else -%}
+{% endif -%}
 # End.

 # Below: "# Model for Token Classification mapping"
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
        ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForTokenClassification),
+{% else -%}
+{% endif -%}
 # End.

 # Below: "# Model for Multiple Choice mapping"
 # Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
        ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForMultipleChoice),
+{% else -%}
+{% endif -%}
+# End.
+
+# Below: "# Model for Seq2Seq Causal LM mapping"
+# Replace with:
+{% if cookiecutter.is_encoder_decoder_model == "False" -%}
+{% else %}
+        ({{cookiecutter.camelcase_modelname}}Config, TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration),
+{% endif -%}
 # End.
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_fast_{{cookiecutter.lowercase_modelname}}.py
@@ -56,6 +56,47 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(BertTokenizerFast):
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    slow_tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer
+
+{%- elif cookiecutter.tokenizer_type == "Based on BART" %}
+from ...utils import logging
+from ..bart.tokenization_bart_fast import BartTokenizerFast
+from .tokenization_{{cookiecutter.lowercase_modelname}} import {{cookiecutter.camelcase_modelname}}Tokenizer
+
+
+logger = logging.get_logger(__name__)
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.json",
+    },
+    "merges_file": {
+        "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/merges.txt",
+    },
+    "tokenizer_file": {
+        "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/tokenizer.json",
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "{{cookiecutter.checkpoint_identifier}}": 1024,
+}
+
+
+class {{cookiecutter.camelcase_modelname}}TokenizerFast(BartTokenizerFast):
+    r"""
+    Construct a "fast" {{cookiecutter.modelname}} tokenizer (backed by HuggingFace's `tokenizers` library).
+
+    :class:`~transformers.{{cookiecutter.camelcase_modelname}}TokenizerFast` is identical to :class:`~transformers.BartTokenizerFast` and runs
+    end-to-end tokenization: punctuation splitting and wordpiece.
+
+    Refer to superclass :class:`~transformers.BartTokenizerFast` for usage examples and documentation concerning
+    parameters.
+    """
+
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    slow_tokenizer_class = {{cookiecutter.camelcase_modelname}}Tokenizer
+
 {%- elif cookiecutter.tokenizer_type == "Standalone" %}
 from typing import List, Optional

--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/tokenization_{{cookiecutter.lowercase_modelname}}.py
@@ -54,6 +54,45 @@ class {{cookiecutter.camelcase_modelname}}Tokenizer(BertTokenizer):
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+
+{%- elif cookiecutter.tokenizer_type == "Based on BART" %}
+from ...utils import logging
+from ..bart.tokenization_bart import BartTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/vocab.json",
+    },
+    "merges_file": {
+        "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/merges.txt",
+    },
+    "tokenizer_file": {
+        "{{cookiecutter.checkpoint_identifier}}": "https://huggingface.co/{{cookiecutter.checkpoint_identifier}}/resolve/main/tokenizer.json",
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "{{cookiecutter.checkpoint_identifier}}": 1024,
+}
+
+
+class {{cookiecutter.camelcase_modelname}}Tokenizer(BartTokenizer):
+    """
+    Construct a {{cookiecutter.modelname}} tokenizer.
+
+    :class:`~transformers.{{cookiecutter.camelcase_modelname}}Tokenizer` is identical to :class:`~transformers.BartTokenizer` and runs end-to-end
+    tokenization: punctuation splitting and wordpiece.
+
+    Refer to superclass :class:`~transformers.BartTokenizer` for usage examples and documentation concerning
+    parameters.
+    """
+
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
 {%- elif cookiecutter.tokenizer_type == "Standalone" %}
 from typing import List, Optional

@@ -289,5 +328,4 @@ class {{cookiecutter.camelcase_modelname}}TokenizerFast(PreTrainedTokenizerFast)
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

-
 {% endif %}
--- a/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst
+++ b/templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/{{cookiecutter.lowercase_modelname}}.rst
@@ -57,7 +57,7 @@ Tips:
 .. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}Model
    :members: forward

-
+{% if cookiecutter.is_encoder_decoder_model == "False" %}
 {{cookiecutter.camelcase_modelname}}ForCausalLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -83,7 +83,7 @@ Tips:
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForMultipleChoice
-    :members:
+    :members: forward


 {{cookiecutter.camelcase_modelname}}ForTokenClassification
@@ -99,6 +99,29 @@ Tips:
 .. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
    :members: forward

+{%- else %}
+{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
+    :members: forward
+
+
+{{cookiecutter.camelcase_modelname}}ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForSequenceClassification
+    :members: forward
+
+
+{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
+    :members: forward
+
+
+{% endif -%}
 {% endif -%}
 {% if "TensorFlow" in cookiecutter.generate_tensorflow_and_pytorch -%}

@@ -108,7 +131,7 @@ TF{{cookiecutter.camelcase_modelname}}Model
 .. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}Model
    :members: call

-
+{% if cookiecutter.is_encoder_decoder_model == "False" %}
 TF{{cookiecutter.camelcase_modelname}}ForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -120,7 +143,7 @@ TF{{cookiecutter.camelcase_modelname}}ForCausalLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForCausalLM
-    :members: forward
+    :members: call


 TF{{cookiecutter.camelcase_modelname}}ForSequenceClassification
@@ -151,4 +174,11 @@ TF{{cookiecutter.camelcase_modelname}}ForQuestionAnswering
    :members: call


+{%- else %}
+TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TF{{cookiecutter.camelcase_modelname}}ForConditionalGeneration
+    :members: call
+{% endif -%}
 {% endif -%}
--- a/templates/adding_a_new_model/cookiecutter.json
+++ b/templates/adding_a_new_model/cookiecutter.json
@@ -5,6 +5,7 @@
  "camelcase_modelname": "BrandNewBert",
  "authors": "The HuggingFace Team",
  "checkpoint_identifier": "brand-new-bert-base-cased",
-  "tokenizer_type": ["Based on BERT", "Standalone"],
-  "generate_tensorflow_and_pytorch": ["PyTorch & TensorFlow", "PyTorch", "TensorFlow"]
+  "tokenizer_type": ["Based on BERT", "Based on BART", "Standalone"],
+  "generate_tensorflow_and_pytorch": ["PyTorch & TensorFlow", "PyTorch", "TensorFlow"],
+  "is_encoder_decoder_model": ["True", "False"]
 }
--- a/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json
+++ b/templates/adding_a_new_model/tests/encoder-bert-tokenizer.json
@@ -6,5 +6,6 @@
  "authors": "The HuggingFace Team",
  "checkpoint_identifier": "brand-new-bert-base-cased",
  "tokenizer_type": "Based on BERT",
-  "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow"
+  "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow",
+  "is_encoder_decoder_model": "False"
 }
--- a/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json
+++ b/templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json
@@ -6,5 +6,6 @@
  "authors": "The HuggingFace Team",
  "checkpoint_identifier": "brand-new-bert-base-cased",
  "tokenizer_type": "Based on BERT",
-  "generate_tensorflow_and_pytorch": "PyTorch"
+  "generate_tensorflow_and_pytorch": "PyTorch",
+  "is_encoder_decoder_model": "False"
 }
--- a/templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json
+++ b/templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json
@@ -0,0 +1,11 @@
+{
+  "modelname": "NewENCDEC",
+  "uppercase_modelname": "NEW_ENC_DEC",
+  "lowercase_modelname": "new_enc_dec",
+  "camelcase_modelname": "NewEncDec",
+  "authors": "The HuggingFace Team",
+  "checkpoint_identifier": "new-enc-dec-base",
+  "tokenizer_type": "Based on BART",
+  "generate_tensorflow_and_pytorch": "PyTorch",
+  "is_encoder_decoder_model": "True"
+}
--- a/templates/adding_a_new_model/tests/standalone.json
+++ b/templates/adding_a_new_model/tests/standalone.json
@@ -6,5 +6,6 @@
  "authors": "The HuggingFace Team",
  "checkpoint_identifier": "bi-brand-new-bert-base-cased",
  "tokenizer_type": "Standalone",
-  "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow"
+  "generate_tensorflow_and_pytorch": "PyTorch & TensorFlow",
+  "is_encoder_decoder_model": "False"
 }
--- a/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json
+++ b/templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json
@@ -6,5 +6,6 @@
  "authors": "The HuggingFace Team",
  "checkpoint_identifier": "brand-new-bert-base-cased",
  "tokenizer_type": "Based on BERT",
-  "generate_tensorflow_and_pytorch": "TensorFlow"
+  "generate_tensorflow_and_pytorch": "TensorFlow",
+  "is_encoder_decoder_model": "False"
 }
--- a/templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json
+++ b/templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json
@@ -0,0 +1,11 @@
+{
+  "modelname": "NewTFENCDEC",
+  "uppercase_modelname": "NEW_TF_ENC_DEC",
+  "lowercase_modelname": "new_tf_enc_dec",
+  "camelcase_modelname": "NewTFEncDec",
+  "authors": "The HuggingFace Team",
+  "checkpoint_identifier": "new-tf-enc-dec-base",
+  "tokenizer_type": "Based on BART",
+  "generate_tensorflow_and_pytorch": "TensorFlow",
+  "is_encoder_decoder_model": "True"
+}
--- a/tests/test_modeling_tf_bart.py
+++ b/tests/test_modeling_tf_bart.py
@@ -94,7 +94,7 @@ class TFBartModelTester:
        self.batch_size = 1

        # first forward pass
-        outputs = model(input_ids, use_cache=True)
+        outputs = model(input_ids, attention_mask=attention_mask, use_cache=True)

        output, past_key_values = outputs.to_tuple()
        past_key_values = past_key_values[1]