diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md
index 438f78ebcf..30a4401244 100644
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -220,96 +220,3 @@ print(sequence)
 ```
 
 The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
-
-### Model2Model example
-
-Encoder-decoder architectures require two tokenized inputs: one for the encoder and the other one for the decoder. Let's assume that we want to use `Model2Model` for generative question answering, and start by tokenizing the question and answer that will be fed to the model.
-
-```python
-import torch
-from transformers import BertTokenizer, Model2Model
-
-# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
-
-# Load pre-trained model tokenizer (vocabulary)
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-# Encode the input to the encoder (the question)
-question = "Who was Jim Henson?"
-encoded_question = tokenizer.encode(question)
-
-# Encode the input to the decoder (the answer)
-answer = "Jim Henson was a puppeteer"
-encoded_answer = tokenizer.encode(answer)
-
-# Convert inputs to PyTorch tensors
-question_tensor = torch.tensor([encoded_question])
-answer_tensor = torch.tensor([encoded_answer])
-```
-
-Let's see how we can use `Model2Model` to get the value of the loss associated with this (question, answer) pair:
-
-```python
-# In order to compute the loss we need to provide language model
-# labels (the token ids that the model should have produced) to
-# the decoder.
-lm_labels =  encoded_answer
-labels_tensor = torch.tensor([lm_labels])
-
-# Load pre-trained model (weights)
-model = Model2Model.from_pretrained('bert-base-uncased')
-
-# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproducible results during evaluation!
-model.eval()
-
-# If you have a GPU, put everything on cuda
-question_tensor = question_tensor.to('cuda')
-answer_tensor = answer_tensor.to('cuda')
-labels_tensor = labels_tensor.to('cuda')
-model.to('cuda')
-
-# Predict hidden states features for each layer
-with torch.no_grad():
-    # See the models docstrings for the detail of the inputs
-    outputs = model(question_tensor, answer_tensor, decoder_lm_labels=labels_tensor)
-    # Transformers models always output tuples.
-    # See the models docstrings for the detail of all the outputs
-    # In our case, the first element is the value of the LM loss 
-    lm_loss = outputs[0]
-```
-
-This loss can be used to fine-tune `Model2Model` on the question answering task. Assuming that we fine-tuned the model, let us now see how to generate an answer:
-
-```python
-# Let's re-use the previous question
-question = "Who was Jim Henson?"
-encoded_question = tokenizer.encode(question)
-question_tensor = torch.tensor([encoded_question])
-
-# This time we try to generate the answer, so we start with an empty sequence
-answer = "[CLS]"
-encoded_answer = tokenizer.encode(answer, add_special_tokens=False)
-answer_tensor = torch.tensor([encoded_answer])
-
-# Load pre-trained model (weights)
-model = Model2Model.from_pretrained('fine-tuned-weights')
-model.eval()
-
-# If you have a GPU, put everything on cuda
-question_tensor = question_tensor.to('cuda')
-answer_tensor = answer_tensor.to('cuda')
-model.to('cuda')
-
-# Predict all tokens
-with torch.no_grad():
-    outputs = model(question_tensor, answer_tensor)
-    predictions = outputs[0]
-
-# confirm we were able to predict 'jim'
-predicted_index = torch.argmax(predictions[0, -1]).item()
-predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-assert predicted_token == 'jim'
-```
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index ac283ff7c8..ad6869f4c4 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -241,7 +241,7 @@ if is_torch_available():
         CamembertForTokenClassification,
         CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     )
-    from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
+    from .modeling_encoder_decoder import PreTrainedEncoderDecoder
     from .modeling_t5 import (
         T5PreTrainedModel,
         T5Model,
diff --git a/src/transformers/modeling_encoder_decoder.py b/src/transformers/modeling_encoder_decoder.py
index 4c5603b217..649d1e858f 100644
--- a/src/transformers/modeling_encoder_decoder.py
+++ b/src/transformers/modeling_encoder_decoder.py
@@ -234,62 +234,3 @@ class PreTrainedEncoderDecoder(nn.Module):
         decoder_outputs = self.decoder(decoder_input_ids, **kwargs_decoder)
 
         return decoder_outputs + encoder_outputs
-
-
-class Model2Model(PreTrainedEncoderDecoder):
-    r"""
-        :class:`~transformers.Model2Model` instantiates a Seq2Seq2 model
-        where both of the encoder and decoder are of the same family. If the
-        name of or that path to a pretrained model is specified the encoder and
-        the decoder will be initialized with the pretrained weight (the
-        cross-attention will be intialized randomly if its weights are not
-        present).
-
-        It is possible to override this behavior and initialize, say, the decoder randomly
-        by creating it beforehand as follows
-
-            config = BertConfig.from_pretrained()
-            decoder = BertForMaskedLM(config)
-            model = Model2Model.from_pretrained('bert-base-uncased', decoder_model=decoder)
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.tie_weights()
-
-    def tie_weights(self):
-        """ Tying the encoder and decoders' embeddings together.
-
-       We need for each to get down to the embedding weights. However the
-        different model classes are inconsistent to that respect:
-        - BertModel: embeddings.word_embeddings
-        - RoBERTa: embeddings.word_embeddings
-        - XLMModel: embeddings
-        - GPT2: wte
-        - BertForMaskedLM: bert.embeddings.word_embeddings
-        - RobertaForMaskedLM: roberta.embeddings.word_embeddings
-
-        argument of the XEmbedding layer for each model, but it is "blocked"
-        by a model-specific keyword (bert, )...
-        """
-        # self._tie_or_clone_weights(self.encoder, self.decoder)
-        pass
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-
-        if (
-            "bert" not in pretrained_model_name_or_path
-            or "roberta" in pretrained_model_name_or_path
-            or "distilbert" in pretrained_model_name_or_path
-        ):
-            raise ValueError("Only the Bert model is currently supported.")
-
-        model = super().from_pretrained(
-            encoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
-            decoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
-            *args,
-            **kwargs,
-        )
-
-        return model
diff --git a/tests/test_modeling_encoder_decoder.py b/tests/test_modeling_encoder_decoder.py
deleted file mode 100644
index ac01e7b561..0000000000
--- a/tests/test_modeling_encoder_decoder.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Hugging Face Inc. Team
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import unittest
-
-from transformers import is_torch_available
-
-from .utils import require_torch, slow
-
-
-if is_torch_available():
-    from transformers import BertModel, BertForMaskedLM, Model2Model
-    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-
-
-@require_torch
-class EncoderDecoderModelTest(unittest.TestCase):
-    @slow
-    def test_model2model_from_pretrained(self):
-        logging.basicConfig(level=logging.INFO)
-        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
-            model = Model2Model.from_pretrained(model_name)
-            self.assertIsInstance(model.encoder, BertModel)
-            self.assertIsInstance(model.decoder, BertForMaskedLM)
-            self.assertEqual(model.decoder.config.is_decoder, True)
-            self.assertEqual(model.encoder.config.is_decoder, False)
-
-    def test_model2model_from_pretrained_not_bert(self):
-        logging.basicConfig(level=logging.INFO)
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("roberta")
-
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("distilbert")
-
-        with self.assertRaises(ValueError):
-            _ = Model2Model.from_pretrained("does-not-exist")