diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md index 438f78ebcf..30a4401244 100644 --- a/docs/source/quickstart.md +++ b/docs/source/quickstart.md @@ -220,96 +220,3 @@ print(sequence) ``` The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`. - -### Model2Model example - -Encoder-decoder architectures require two tokenized inputs: one for the encoder and the other one for the decoder. Let's assume that we want to use `Model2Model` for generative question answering, and start by tokenizing the question and answer that will be fed to the model. - -```python -import torch -from transformers import BertTokenizer, Model2Model - -# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows -import logging -logging.basicConfig(level=logging.INFO) - -# Load pre-trained model tokenizer (vocabulary) -tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') - -# Encode the input to the encoder (the question) -question = "Who was Jim Henson?" -encoded_question = tokenizer.encode(question) - -# Encode the input to the decoder (the answer) -answer = "Jim Henson was a puppeteer" -encoded_answer = tokenizer.encode(answer) - -# Convert inputs to PyTorch tensors -question_tensor = torch.tensor([encoded_question]) -answer_tensor = torch.tensor([encoded_answer]) -``` - -Let's see how we can use `Model2Model` to get the value of the loss associated with this (question, answer) pair: - -```python -# In order to compute the loss we need to provide language model -# labels (the token ids that the model should have produced) to -# the decoder. -lm_labels = encoded_answer -labels_tensor = torch.tensor([lm_labels]) - -# Load pre-trained model (weights) -model = Model2Model.from_pretrained('bert-base-uncased') - -# Set the model in evaluation mode to deactivate the DropOut modules -# This is IMPORTANT to have reproducible results during evaluation! -model.eval() - -# If you have a GPU, put everything on cuda -question_tensor = question_tensor.to('cuda') -answer_tensor = answer_tensor.to('cuda') -labels_tensor = labels_tensor.to('cuda') -model.to('cuda') - -# Predict hidden states features for each layer -with torch.no_grad(): - # See the models docstrings for the detail of the inputs - outputs = model(question_tensor, answer_tensor, decoder_lm_labels=labels_tensor) - # Transformers models always output tuples. - # See the models docstrings for the detail of all the outputs - # In our case, the first element is the value of the LM loss - lm_loss = outputs[0] -``` - -This loss can be used to fine-tune `Model2Model` on the question answering task. Assuming that we fine-tuned the model, let us now see how to generate an answer: - -```python -# Let's re-use the previous question -question = "Who was Jim Henson?" -encoded_question = tokenizer.encode(question) -question_tensor = torch.tensor([encoded_question]) - -# This time we try to generate the answer, so we start with an empty sequence -answer = "[CLS]" -encoded_answer = tokenizer.encode(answer, add_special_tokens=False) -answer_tensor = torch.tensor([encoded_answer]) - -# Load pre-trained model (weights) -model = Model2Model.from_pretrained('fine-tuned-weights') -model.eval() - -# If you have a GPU, put everything on cuda -question_tensor = question_tensor.to('cuda') -answer_tensor = answer_tensor.to('cuda') -model.to('cuda') - -# Predict all tokens -with torch.no_grad(): - outputs = model(question_tensor, answer_tensor) - predictions = outputs[0] - -# confirm we were able to predict 'jim' -predicted_index = torch.argmax(predictions[0, -1]).item() -predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] -assert predicted_token == 'jim' -``` diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ac283ff7c8..ad6869f4c4 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -241,7 +241,7 @@ if is_torch_available(): CamembertForTokenClassification, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ) - from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model + from .modeling_encoder_decoder import PreTrainedEncoderDecoder from .modeling_t5 import ( T5PreTrainedModel, T5Model, diff --git a/src/transformers/modeling_encoder_decoder.py b/src/transformers/modeling_encoder_decoder.py index 4c5603b217..649d1e858f 100644 --- a/src/transformers/modeling_encoder_decoder.py +++ b/src/transformers/modeling_encoder_decoder.py @@ -234,62 +234,3 @@ class PreTrainedEncoderDecoder(nn.Module): decoder_outputs = self.decoder(decoder_input_ids, **kwargs_decoder) return decoder_outputs + encoder_outputs - - -class Model2Model(PreTrainedEncoderDecoder): - r""" - :class:`~transformers.Model2Model` instantiates a Seq2Seq2 model - where both of the encoder and decoder are of the same family. If the - name of or that path to a pretrained model is specified the encoder and - the decoder will be initialized with the pretrained weight (the - cross-attention will be intialized randomly if its weights are not - present). - - It is possible to override this behavior and initialize, say, the decoder randomly - by creating it beforehand as follows - - config = BertConfig.from_pretrained() - decoder = BertForMaskedLM(config) - model = Model2Model.from_pretrained('bert-base-uncased', decoder_model=decoder) - """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.tie_weights() - - def tie_weights(self): - """ Tying the encoder and decoders' embeddings together. - - We need for each to get down to the embedding weights. However the - different model classes are inconsistent to that respect: - - BertModel: embeddings.word_embeddings - - RoBERTa: embeddings.word_embeddings - - XLMModel: embeddings - - GPT2: wte - - BertForMaskedLM: bert.embeddings.word_embeddings - - RobertaForMaskedLM: roberta.embeddings.word_embeddings - - argument of the XEmbedding layer for each model, but it is "blocked" - by a model-specific keyword (bert, )... - """ - # self._tie_or_clone_weights(self.encoder, self.decoder) - pass - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): - - if ( - "bert" not in pretrained_model_name_or_path - or "roberta" in pretrained_model_name_or_path - or "distilbert" in pretrained_model_name_or_path - ): - raise ValueError("Only the Bert model is currently supported.") - - model = super().from_pretrained( - encoder_pretrained_model_name_or_path=pretrained_model_name_or_path, - decoder_pretrained_model_name_or_path=pretrained_model_name_or_path, - *args, - **kwargs, - ) - - return model diff --git a/tests/test_modeling_encoder_decoder.py b/tests/test_modeling_encoder_decoder.py deleted file mode 100644 index ac01e7b561..0000000000 --- a/tests/test_modeling_encoder_decoder.py +++ /dev/null @@ -1,50 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The Hugging Face Inc. Team -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -import unittest - -from transformers import is_torch_available - -from .utils import require_torch, slow - - -if is_torch_available(): - from transformers import BertModel, BertForMaskedLM, Model2Model - from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP - - -@require_torch -class EncoderDecoderModelTest(unittest.TestCase): - @slow - def test_model2model_from_pretrained(self): - logging.basicConfig(level=logging.INFO) - for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = Model2Model.from_pretrained(model_name) - self.assertIsInstance(model.encoder, BertModel) - self.assertIsInstance(model.decoder, BertForMaskedLM) - self.assertEqual(model.decoder.config.is_decoder, True) - self.assertEqual(model.encoder.config.is_decoder, False) - - def test_model2model_from_pretrained_not_bert(self): - logging.basicConfig(level=logging.INFO) - with self.assertRaises(ValueError): - _ = Model2Model.from_pretrained("roberta") - - with self.assertRaises(ValueError): - _ = Model2Model.from_pretrained("distilbert") - - with self.assertRaises(ValueError): - _ = Model2Model.from_pretrained("does-not-exist")