Delete all mentions of Model2Model (#3019)

2020-02-26 11:36:27 -05:00
parent bb7c468520
commit 9df74b8bc4
4 changed files with 1 additions and 203 deletions
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -220,96 +220,3 @@ print(sequence)
 ```
 The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
 ### Model2Model example
 Encoder-decoder architectures require two tokenized inputs: one for the encoder and the other one for the decoder. Let's assume that we want to use `Model2Model` for generative question answering, and start by tokenizing the question and answer that will be fed to the model.
 ```python
 import torch
 from transformers import BertTokenizer, Model2Model
 # OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
 import logging
 logging.basicConfig(level=logging.INFO)
 # Load pre-trained model tokenizer (vocabulary)
 tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 # Encode the input to the encoder (the question)
 question = "Who was Jim Henson?"
 encoded_question = tokenizer.encode(question)
 # Encode the input to the decoder (the answer)
 answer = "Jim Henson was a puppeteer"
 encoded_answer = tokenizer.encode(answer)
 # Convert inputs to PyTorch tensors.
 # Each encoding is wrapped in an outer list, so (assuming `encode` returns a
 # flat list of token ids -- TODO confirm) each tensor holds a single sequence.
 question_tensor = torch.tensor([encoded_question])
 answer_tensor = torch.tensor([encoded_answer])
 ```
 Let's see how we can use `Model2Model` to get the value of the loss associated with this (question, answer) pair:
 ```python
 # In order to compute the loss we need to provide language model
 # labels (the token ids that the model should have produced) to
 # the decoder.
 lm_labels = encoded_answer
 labels_tensor = torch.tensor([lm_labels])
 # Load pre-trained model (weights)
 model = Model2Model.from_pretrained('bert-base-uncased')
 # Set the model in evaluation mode to deactivate the DropOut modules
 # This is IMPORTANT to have reproducible results during evaluation!
 model.eval()
 # Use the GPU when one is available; otherwise stay on the CPU so that the
 # example also runs on machines without CUDA.
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 question_tensor = question_tensor.to(device)
 answer_tensor = answer_tensor.to(device)
 labels_tensor = labels_tensor.to(device)
 model.to(device)
 # Compute the language-modeling loss without tracking gradients
 with torch.no_grad():
     # See the models docstrings for the detail of the inputs
     outputs = model(question_tensor, answer_tensor, decoder_lm_labels=labels_tensor)
     # Transformers models always output tuples.
     # See the models docstrings for the detail of all the outputs
     # In our case, the first element is the value of the LM loss
     lm_loss = outputs[0]
 ```
 This loss can be used to fine-tune `Model2Model` on the question answering task. Assuming that we fine-tuned the model, let us now see how to generate an answer:
 ```python
 # Let's re-use the previous question
 question = "Who was Jim Henson?"
 encoded_question = tokenizer.encode(question)
 question_tensor = torch.tensor([encoded_question])
 # This time we try to generate the answer, so the decoder input holds
 # nothing but the `[CLS]` token (no other special tokens are added)
 answer = "[CLS]"
 encoded_answer = tokenizer.encode(answer, add_special_tokens=False)
 answer_tensor = torch.tensor([encoded_answer])
 # Load pre-trained model (weights)
 model = Model2Model.from_pretrained('fine-tuned-weights')
 model.eval()
 # Use the GPU when one is available; otherwise stay on the CPU so that the
 # example also runs on machines without CUDA.
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 question_tensor = question_tensor.to(device)
 answer_tensor = answer_tensor.to(device)
 model.to(device)
 # Predict all tokens
 with torch.no_grad():
     outputs = model(question_tensor, answer_tensor)
     predictions = outputs[0]
 # confirm we were able to predict 'jim'
 predicted_index = torch.argmax(predictions[0, -1]).item()
 predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
 assert predicted_token == 'jim'
 ```
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@@ -241,7 +241,7 @@ if is_torch_available():
        CamembertForTokenClassification,
        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
    )
-    from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
+    from .modeling_encoder_decoder import PreTrainedEncoderDecoder
    from .modeling_t5 import (
        T5PreTrainedModel,
        T5Model,
--- a/src/transformers/modeling_encoder_decoder.py
+++ b/src/transformers/modeling_encoder_decoder.py
@@ -234,62 +234,3 @@ class PreTrainedEncoderDecoder(nn.Module):
        decoder_outputs = self.decoder(decoder_input_ids, **kwargs_decoder)
        return decoder_outputs + encoder_outputs
 class Model2Model(PreTrainedEncoderDecoder):
    r"""
        :class:`~transformers.Model2Model` builds a sequence-to-sequence model
        whose encoder and decoder belong to the same model family.
        When the name of, or the path to, a pretrained model is given, both
        the encoder and the decoder are initialized from its pretrained
        weights (cross-attention weights that are missing from the checkpoint
        are initialized randomly).
        This default can be overridden. To start from, say, a randomly
        initialized decoder, create it beforehand and pass it in:
            config = BertConfig.from_pretrained()
            decoder = BertForMaskedLM(config)
            model = Model2Model.from_pretrained('bert-base-uncased', decoder_model=decoder)
    """
    def __init__(self, *args, **kwargs):
        """Forward every argument to the parent constructor, then call :meth:`tie_weights`."""
        super().__init__(*args, **kwargs)
        self.tie_weights()
    def tie_weights(self):
        """ Placeholder for tying the encoder's and the decoder's embeddings.
        Tying requires reaching the embedding weights of both sub-models, but
        the model classes do not expose them under a common attribute path:
        - BertModel: embeddings.word_embeddings
        - RoBERTa: embeddings.word_embeddings
        - XLMModel: embeddings
        - GPT2: wte
        - BertForMaskedLM: bert.embeddings.word_embeddings
        - RobertaForMaskedLM: roberta.embeddings.word_embeddings
        On the language-model heads the embedding layer is additionally
        nested under a model-specific attribute (``bert``, ``roberta``).
        """
        # Currently a no-op: the generic call
        # `self._tie_or_clone_weights(self.encoder, self.decoder)` is disabled.
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        """Build the encoder and the decoder from one and the same checkpoint.
        Args:
            pretrained_model_name_or_path: checkpoint identifier; it is
                forwarded to the parent loader as both the encoder and the
                decoder checkpoint.
            *args, **kwargs: forwarded unchanged to the parent ``from_pretrained``.
        Returns:
            Whatever the parent ``from_pretrained`` returns.
        Raises:
            ValueError: if the identifier does not contain ``"bert"``, or
                contains ``"roberta"`` or ``"distilbert"``.
        """
        checkpoint = pretrained_model_name_or_path
        # NOTE(review): this is a plain substring test, so identifiers such as
        # "albert-..." or "camembert-..." also contain "bert" and are accepted.
        looks_like_bert = (
            "bert" in checkpoint
            and "roberta" not in checkpoint
            and "distilbert" not in checkpoint
        )
        if not looks_like_bert:
            raise ValueError("Only the Bert model is currently supported.")
        # NOTE(review): extra positional `args` are unpacked next to the two
        # keyword arguments; confirm the parent signature tolerates that.
        return super().from_pretrained(
            encoder_pretrained_model_name_or_path=checkpoint,
            decoder_pretrained_model_name_or_path=checkpoint,
            *args,
            **kwargs,
        )
--- a/tests/test_modeling_encoder_decoder.py
+++ b/tests/test_modeling_encoder_decoder.py
@@ -1,50 +0,0 @@
 # coding=utf-8
 # Copyright 2018 The Hugging Face Inc. Team
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 #     http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import logging
 import unittest
 from transformers import is_torch_available
 from .utils import require_torch, slow
 if is_torch_available():
    from transformers import BertModel, BertForMaskedLM, Model2Model
    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
@require_torch
 class EncoderDecoderModelTest(unittest.TestCase):
    @slow
    def test_model2model_from_pretrained(self):
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = Model2Model.from_pretrained(model_name)
            self.assertIsInstance(model.encoder, BertModel)
            self.assertIsInstance(model.decoder, BertForMaskedLM)
            self.assertEqual(model.decoder.config.is_decoder, True)
            self.assertEqual(model.encoder.config.is_decoder, False)
    def test_model2model_from_pretrained_not_bert(self):
        logging.basicConfig(level=logging.INFO)
        with self.assertRaises(ValueError):
            _ = Model2Model.from_pretrained("roberta")
        with self.assertRaises(ValueError):
            _ = Model2Model.from_pretrained("distilbert")
        with self.assertRaises(ValueError):
            _ = Model2Model.from_pretrained("does-not-exist")