From e714412fe6b38346a1f73525b701e030857b2f21 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Tue, 21 Jul 2020 18:13:55 -0400 Subject: [PATCH] Update doc to new model outputs (#5946) * Update doc to new model outputs * Fix outputs in quicktour --- docs/source/quicktour.rst | 23 +++++++++++++++++------ docs/source/task_summary.rst | 10 +++++----- docs/source/training.rst | 4 ++-- src/transformers/file_utils.py | 21 ++++++++++++++------- src/transformers/modeling_albert.py | 3 ++- src/transformers/modeling_bart.py | 2 +- src/transformers/modeling_bert.py | 8 ++++---- src/transformers/modeling_distilbert.py | 4 ++-- src/transformers/modeling_dpr.py | 11 +++++------ src/transformers/modeling_electra.py | 3 +-- src/transformers/modeling_gpt2.py | 3 ++- src/transformers/modeling_longformer.py | 10 +++++++--- src/transformers/modeling_mobilebert.py | 7 +++++-- src/transformers/modeling_openai.py | 3 ++- src/transformers/modeling_xlm.py | 2 +- src/transformers/modeling_xlnet.py | 6 +++--- 16 files changed, 73 insertions(+), 47 deletions(-) diff --git a/docs/source/quicktour.rst b/docs/source/quicktour.rst index 5c54cfd41b..d0ac2e9d81 100644 --- a/docs/source/quicktour.rst +++ b/docs/source/quicktour.rst @@ -230,13 +230,18 @@ final activations of the model. >>> ## PYTORCH CODE >>> print(pt_outputs) - (tensor([[-4.0833, 4.3364], - [ 0.0818, -0.0418]], grad_fn=),) + SequenceClassifierOutput(loss=None, logits=tensor([[-4.0833, 4.3364], + [ 0.0818, -0.0418]], grad_fn=), hidden_states=None, attentions=None) >>> ## TENSORFLOW CODE >>> print(tf_outputs) (,) + array([[-4.0832963 , 4.336414 ], + [ 0.08181786, -0.04179301]], dtype=float32)>,) + +The model can return more than just the final activations, which is why the PyTorch output is a special class and the +TensorFlow output is a tuple. 
Here we only asked for the final activations, so we get a tuple with one element on the +TensorFlow side and a :class:`~transformers.modeling_outputs.SequenceClassifierOutput` with just the ``logits`` field +filled on the PyTorch side. .. note:: @@ -249,7 +254,7 @@ Let's apply the SoftMax activation to get predictions. >>> ## PYTORCH CODE >>> import torch.nn.functional as F - >>> pt_predictions = F.softmax(pt_outputs[0], dim=-1) + >>> pt_predictions = F.softmax(pt_outputs.logits, dim=-1) >>> ## TENSORFLOW CODE >>> import tensorflow as tf >>> tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1) @@ -262,7 +267,7 @@ We can see we get the numbers from before: >>> print(tf_predictions) tf.Tensor( [[2.2042994e-04 9.9977952e-01] - [5.3086078e-01 4.6913919e-01]], shape=(2, 2), dtype=float32) + [5.3086340e-01 4.6913657e-01]], shape=(2, 2), dtype=float32) >>> ## PYTORCH CODE >>> print(pt_predictions) tensor([[2.2043e-04, 9.9978e-01], @@ -285,6 +290,12 @@ training loop. 🤗 Transformers also provides a :class:`~transformers.Trainer` you are using TensorFlow) class to help with your training (taking care of things such as distributed training, mixed precision, etc.). See the :doc:`training tutorial <training>` for more details. +.. note:: + + PyTorch model outputs are special dataclasses so that you can get autocompletion for their attributes in an IDE. + They also behave like a tuple or a dictionary (e.g., you can index with an integer, a slice or a string) in which + case the attributes not set (that have :obj:`None` values) are ignored. + Once your model is fine-tuned, you can save it with its tokenizer in the following way: :: diff --git a/docs/source/task_summary.rst b/docs/source/task_summary.rst index 0a425b52cf..90e065b481 100644 --- a/docs/source/task_summary.rst +++ b/docs/source/task_summary.rst @@ -98,8 +98,8 @@ of each other. 
The process is the following: >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt") >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt") - >>> paraphrase_classification_logits = model(**paraphrase)[0] - >>> not_paraphrase_classification_logits = model(**not_paraphrase)[0] + >>> paraphrase_classification_logits = model(**paraphrase).logits + >>> not_paraphrase_classification_logits = model(**not_paraphrase).logits >>> paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0] >>> not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0] @@ -375,7 +375,7 @@ Here is an example doing masked language modeling using a model and a tokenizer. >>> input = tokenizer.encode(sequence, return_tensors="pt") >>> mask_token_index = torch.where(input == tokenizer.mask_token_id)[1] - >>> token_logits = model(input)[0] + >>> token_logits = model(input).logits >>> mask_token_logits = token_logits[0, mask_token_index, :] >>> top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist() @@ -436,7 +436,7 @@ Here is an example using the tokenizer and model and leveraging the :func:`~tran >>> input_ids = tokenizer.encode(sequence, return_tensors="pt") >>> # get logits of last hidden state - >>> next_token_logits = model(input_ids)[0][:, -1, :] + >>> next_token_logits = model(input_ids).logits[:, -1, :] >>> # filter >>> filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0) @@ -666,7 +666,7 @@ Here is an example doing named entity recognition using a model and a tokenizer. 
>>> tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence))) >>> inputs = tokenizer.encode(sequence, return_tensors="pt") - >>> outputs = model(inputs)[0] + >>> outputs = model(inputs).logits >>> predictions = torch.argmax(outputs, dim=2) >>> ## TENSORFLOW CODE >>> from transformers import TFAutoModelForTokenClassification, AutoTokenizer diff --git a/docs/source/training.rst b/docs/source/training.rst index c7fdc02574..7ddfcc40fb 100644 --- a/docs/source/training.rst +++ b/docs/source/training.rst @@ -99,7 +99,7 @@ backwards pass and update the weights: labels = torch.tensor([1,0]).unsqueeze(0) outputs = model(input_ids, attention_mask=attention_mask, labels=labels) - loss = outputs[0] + loss = outputs.loss loss.backward() optimizer.step() @@ -111,7 +111,7 @@ The following is equivalent to the previous example: from torch.nn import functional as F labels = torch.tensor([1,0]).unsqueeze(0) outputs = model(input_ids, attention_mask=attention_mask) - loss = F.cross_entropy(labels, outputs[0]) + loss = F.cross_entropy(outputs.logits, labels) loss.backward() optimizer.step() diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 33688b7676..57b8c3d310 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -226,7 +226,8 @@ PT_TOKEN_CLASSIFICATION_SAMPLE = r""" >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1 >>> outputs = model(**inputs, labels=labels) - >>> loss, scores = outputs[:2] + >>> loss = outputs.loss + >>> logits = outputs.logits """ PT_QUESTION_ANSWERING_SAMPLE = r""" @@ -243,7 +244,9 @@ PT_QUESTION_ANSWERING_SAMPLE = r""" >>> end_positions = torch.tensor([3]) >>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions) - >>> loss, start_scores, end_scores = outputs[:3] + >>> loss = outputs.loss + >>> start_scores = outputs.start_logits + >>> end_scores = outputs.end_logits """ PT_SEQUENCE_CLASSIFICATION_SAMPLE = r""" 
@@ -258,7 +261,8 @@ PT_SEQUENCE_CLASSIFICATION_SAMPLE = r""" >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 >>> outputs = model(**inputs, labels=labels) - >>> loss, logits = outputs[:2] + >>> loss = outputs.loss + >>> logits = outputs.logits """ PT_MASKED_LM_SAMPLE = r""" @@ -273,7 +277,8 @@ PT_MASKED_LM_SAMPLE = r""" >>> input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"] >>> outputs = model(input_ids, labels=input_ids) - >>> loss, prediction_scores = outputs[:2] + >>> loss = outputs.loss + >>> prediction_logits = outputs.logits """ PT_BASE_MODEL_SAMPLE = r""" @@ -288,7 +293,7 @@ PT_BASE_MODEL_SAMPLE = r""" >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) - >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + >>> last_hidden_states = outputs.last_hidden_state """ PT_MULTIPLE_CHOICE_SAMPLE = r""" @@ -309,7 +314,8 @@ PT_MULTIPLE_CHOICE_SAMPLE = r""" >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1 >>> # the linear classifier still needs to be trained - >>> loss, logits = outputs[:2] + >>> loss = outputs.loss + >>> logits = outputs.logits """ PT_CAUSAL_LM_SAMPLE = r""" @@ -323,7 +329,8 @@ PT_CAUSAL_LM_SAMPLE = r""" >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs, labels=inputs["input_ids"]) - >>> loss, logits = outputs[:2] + >>> loss = outputs.loss + >>> logits = outputs.logits """ TF_TOKEN_CLASSIFICATION_SAMPLE = r""" diff --git a/src/transformers/modeling_albert.py b/src/transformers/modeling_albert.py index c1f1f73c76..f50ce673f2 100644 --- a/src/transformers/modeling_albert.py +++ b/src/transformers/modeling_albert.py @@ -683,7 +683,8 @@ class AlbertForPreTraining(AlbertPreTrainedModel): >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is 
cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> outputs = model(input_ids) - >>> prediction_scores, sop_scores = outputs[:2] + >>> prediction_logits = outputs.prediction_logits + >>> sop_logits = outputs.sop_logits """ diff --git a/src/transformers/modeling_bart.py b/src/transformers/modeling_bart.py index d15142a0ca..66a6527f43 100644 --- a/src/transformers/modeling_bart.py +++ b/src/transformers/modeling_bart.py @@ -996,7 +996,7 @@ class BartForConditionalGeneration(PretrainedBartModel): model = BartForConditionalGeneration.from_pretrained('facebook/bart-large') input_ids = tokenizer([TXT], return_tensors='pt')['input_ids'] - logits = model(input_ids)[0] + logits = model(input_ids).logits masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() probs = logits[0, masked_index].softmax(dim=0) diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index d2f6c37102..281ecd766f 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -873,8 +873,8 @@ class BertForPreTraining(BertPreTrainedModel): >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) - >>> prediction_scores, seq_relationship_scores = outputs[:2] - + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits """ if "masked_lm_labels" in kwargs: warnings.warn( @@ -978,7 +978,7 @@ class BertLMHeadModel(BertPreTrainedModel): >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") >>> outputs = model(**inputs) - >>> prediction_scores = outputs.prediction_scores + >>> prediction_logits = outputs.logits """ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple @@ -1181,7 +1181,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel): >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt') >>> outputs = model(**encoding, 
next_sentence_label=torch.LongTensor([1])) - >>> logits = outputs.seq_relationship_scores + >>> logits = outputs.logits >>> assert logits[0, 0] < logits[0, 1] # next sentence was random """ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple diff --git a/src/transformers/modeling_distilbert.py b/src/transformers/modeling_distilbert.py index fd5034f3b0..9c3f4e0319 100644 --- a/src/transformers/modeling_distilbert.py +++ b/src/transformers/modeling_distilbert.py @@ -876,8 +876,8 @@ class DistilBertForMultipleChoice(DistilBertPreTrainedModel): >>> outputs = model(**{k: v.unsqueeze(0) for k,v in encoding.items()}, labels=labels) # batch size is 1 >>> # the linear classifier still needs to be trained - >>> loss, logits = outputs[:2] - + >>> loss = outputs.loss + >>> logits = outputs.logits """ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] diff --git a/src/transformers/modeling_dpr.py b/src/transformers/modeling_dpr.py index bce8b4e17c..7cffaabdc0 100644 --- a/src/transformers/modeling_dpr.py +++ b/src/transformers/modeling_dpr.py @@ -423,8 +423,7 @@ class DPRContextEncoder(DPRPretrainedContextEncoder): tokenizer = DPRContextEncoderTokenizer.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base') model = DPRContextEncoder.from_pretrained('facebook/dpr-ctx_encoder-single-nq-base') input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"] - embeddings = model(input_ids)[0] # the embeddings of the given context. 
- + embeddings = model(input_ids).pooler_output """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -502,7 +501,7 @@ class DPRQuestionEncoder(DPRPretrainedQuestionEncoder): tokenizer = DPRQuestionEncoderTokenizer.from_pretrained('facebook/dpr-question_encoder-single-nq-base') model = DPRQuestionEncoder.from_pretrained('facebook/dpr-question_encoder-single-nq-base') input_ids = tokenizer("Hello, is my dog cute ?", return_tensors='pt')["input_ids"] - embeddings = model(input_ids)[0] # the embeddings of the given question. + embeddings = model(input_ids).pooler_output """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -583,9 +582,9 @@ class DPRReader(DPRPretrainedReader): return_tensors='pt' ) outputs = model(**encoded_inputs) - start_logits = outputs[0] # The logits of the start of the spans - end_logits = outputs[1] # The logits of the end of the spans - relevance_logits = outputs[2] # The relevance scores of the passages + start_logits = outputs.start_logits + end_logits = outputs.end_logits + relevance_logits = outputs.relevance_logits """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions diff --git a/src/transformers/modeling_electra.py b/src/transformers/modeling_electra.py index 267dbea7d9..8f24343cca 100644 --- a/src/transformers/modeling_electra.py +++ b/src/transformers/modeling_electra.py @@ -525,8 +525,7 @@ class ElectraForPreTraining(ElectraPreTrainedModel): >>> model = ElectraForPreTraining.from_pretrained('google/electra-small-discriminator') >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - >>> scores = model(input_ids)[0] - + >>> logits = model(input_ids).logits """ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple diff --git 
a/src/transformers/modeling_gpt2.py b/src/transformers/modeling_gpt2.py index e13715844d..010513bc99 100644 --- a/src/transformers/modeling_gpt2.py +++ b/src/transformers/modeling_gpt2.py @@ -754,7 +754,8 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): >>> mc_token_ids = torch.tensor([cls_token_location]) # Batch size: 1 >>> outputs = model(input_ids, mc_token_ids=mc_token_ids) - >>> lm_prediction_scores, mc_prediction_scores = outputs[:2] + >>> lm_logits = outputs.lm_logits + >>> mc_logits = outputs.mc_logits """ if "lm_labels" in kwargs: diff --git a/src/transformers/modeling_longformer.py b/src/transformers/modeling_longformer.py index 9d5e0d5b53..c440af07a0 100644 --- a/src/transformers/modeling_longformer.py +++ b/src/transformers/modeling_longformer.py @@ -1090,7 +1090,9 @@ class LongformerForMaskedLM(LongformerPreTrainedModel): >>> attention_mask = None # default is local attention everywhere, which is a good choice for MaskedLM ... # check ``LongformerModel.forward`` for more details how to set `attention_mask` - >>> loss, prediction_scores = model(input_ids, attention_mask=attention_mask, labels=input_ids) + >>> outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids) + >>> loss = outputs.loss + >>> prediction_logits = outputs.logits """ if "masked_lm_labels" in kwargs: @@ -1299,10 +1301,12 @@ class LongformerForQuestionAnswering(BertPreTrainedModel): >>> # the forward method will automatically set global attention on question tokens >>> attention_mask = encoding["attention_mask"] - >>> start_scores, end_scores = model(input_ids, attention_mask=attention_mask) + >>> outputs = model(input_ids, attention_mask=attention_mask) + >>> start_logits = outputs.start_logits + >>> end_logits = outputs.end_logits >>> all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist()) - >>> answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1] + >>> answer_tokens = all_tokens[torch.argmax(start_logits) 
:torch.argmax(end_logits)+1] >>> answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens)) # remove space prepending space token """ diff --git a/src/transformers/modeling_mobilebert.py b/src/transformers/modeling_mobilebert.py index b62035f541..a32957c522 100644 --- a/src/transformers/modeling_mobilebert.py +++ b/src/transformers/modeling_mobilebert.py @@ -979,7 +979,8 @@ class MobileBertForPreTraining(MobileBertPreTrainedModel): >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 >>> outputs = model(input_ids) - >>> prediction_scores, seq_relationship_scores = outputs[:2] + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits """ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple @@ -1186,7 +1187,9 @@ class MobileBertForNextSentencePrediction(MobileBertPreTrainedModel): >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." 
>>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt') - >>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1])) + >>> outputs = model(**encoding, next_sentence_label=torch.LongTensor([1])) + >>> loss = outputs.loss + >>> logits = outputs.logits """ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple diff --git a/src/transformers/modeling_openai.py b/src/transformers/modeling_openai.py index 5e6a0adbc5..5365b943af 100644 --- a/src/transformers/modeling_openai.py +++ b/src/transformers/modeling_openai.py @@ -659,7 +659,8 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): mc_token_ids = torch.tensor([input_ids.size(-1)-1, input_ids.size(-1)-1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, mc_token_ids=mc_token_ids) - lm_prediction_scores, mc_prediction_scores = outputs[:2] + lm_logits = outputs.lm_logits + mc_logits = outputs.mc_logits """ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple if "lm_labels" in kwargs: diff --git a/src/transformers/modeling_xlm.py b/src/transformers/modeling_xlm.py index 3847d91756..96c287faef 100644 --- a/src/transformers/modeling_xlm.py +++ b/src/transformers/modeling_xlm.py @@ -989,7 +989,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): >>> end_positions = torch.tensor([3]) >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) - >>> loss = outputs[0] + >>> loss = outputs.loss """ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple diff --git a/src/transformers/modeling_xlnet.py b/src/transformers/modeling_xlnet.py index 1748271521..4448313817 100644 --- a/src/transformers/modeling_xlnet.py +++ b/src/transformers/modeling_xlnet.py @@ -1366,8 +1366,8 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token) 
outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping, labels=labels) - loss, next_token_logits = outputs[:2] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] - + loss = outputs.loss + next_token_logits = outputs.logits # Logits have shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] """ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache) @@ -1876,7 +1876,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): >>> end_positions = torch.tensor([3]) >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) - >>> loss = outputs[0] + >>> loss = outputs.loss """ return_tuple = return_tuple if return_tuple is not None else self.config.use_return_tuple use_cache = self.training or (use_cache if use_cache is not None else self.config.use_cache)