From 3e1bc27e1b80183d3d3bd7aa2c2596a039a08ad3 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Mon, 20 Jan 2020 13:24:04 -0500 Subject: [PATCH] Pytorch RoBERTa --- docs/source/model_doc/roberta.rst | 26 +- src/transformers/modeling_bert.py | 2 +- src/transformers/modeling_roberta.py | 486 ++++++++++++--------------- 3 files changed, 234 insertions(+), 280 deletions(-) diff --git a/docs/source/model_doc/roberta.rst b/docs/source/model_doc/roberta.rst index aa7b728cec..3592aadead 100644 --- a/docs/source/model_doc/roberta.rst +++ b/docs/source/model_doc/roberta.rst @@ -1,35 +1,45 @@ RoBERTa ---------------------------------------------------- -``RobertaConfig`` +The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_ +by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, +Veselin Stoyanov. It is based on Google's BERT model released in 2018. + +It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining +objective and training with much larger mini-batches and learning rates. + +This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained +models. + +RobertaConfig ~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.RobertaConfig :members: -``RobertaTokenizer`` +RobertaTokenizer ~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.RobertaTokenizer :members: -``RobertaModel`` +RobertaModel ~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.RobertaModel :members: -``RobertaForMaskedLM`` +RobertaForMaskedLM ~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.RobertaForMaskedLM :members: -``RobertaForSequenceClassification`` +RobertaForSequenceClassification ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.RobertaForSequenceClassification @@ -42,21 +52,21 @@ RobertaForTokenClassification .. autoclass:: transformers.RobertaForTokenClassification :members: -``TFRobertaModel`` +TFRobertaModel ~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.TFRobertaModel :members: -``TFRobertaForMaskedLM`` +TFRobertaForMaskedLM ~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.TFRobertaForMaskedLM :members: -``TFRobertaForSequenceClassification`` +TFRobertaForSequenceClassification ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. autoclass:: transformers.TFRobertaForSequenceClassification diff --git a/src/transformers/modeling_bert.py b/src/transformers/modeling_bert.py index 61c7dd6dc3..67a86ef70b 100644 --- a/src/transformers/modeling_bert.py +++ b/src/transformers/modeling_bert.py @@ -592,7 +592,7 @@ BERT_INPUTS_DOCSTRING = r""" Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. - input_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. diff --git a/src/transformers/modeling_roberta.py b/src/transformers/modeling_roberta.py index 27ef7bf270..23c7b29301 100644 --- a/src/transformers/modeling_roberta.py +++ b/src/transformers/modeling_roberta.py @@ -23,7 +23,7 @@ import torch.nn as nn from torch.nn import CrossEntropyLoss, MSELoss from .configuration_roberta import RobertaConfig -from .file_utils import add_start_docstrings +from .file_utils import add_start_docstrings, add_start_docstrings_to_callable from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu @@ -92,26 +92,11 @@ class RobertaEmbeddings(BertEmbeddings): return position_ids.unsqueeze(0).expand(input_shape) -ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in - `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_ - by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, - Veselin Stoyanov. It is based on Google's BERT model released in 2018. +ROBERTA_START_DOCSTRING = r""" - It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining - objective and training with much larger mini-batches and learning rates. - - This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained - models. - - This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + This model is a PyTorch `torch.nn.Module `_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. - .. _`RoBERTa: A Robustly Optimized BERT Pretraining Approach`: - https://arxiv.org/abs/1907.11692 - - .. _`torch.nn.Module`: - https://pytorch.org/docs/stable/nn.html#module - Parameters: config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. @@ -119,47 +104,38 @@ ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in """ ROBERTA_INPUTS_DOCSTRING = r""" - Inputs: - **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Indices of input sequence tokens in the vocabulary. - To match pre-training, RoBERTa input sequence should be formatted with and tokens as follows: - - (a) For sequence pairs: - - ``tokens: Is this Jacksonville ? No it is not . `` - - (b) For single sequences: - - ``tokens: the dog is hairy . `` - - Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with - the ``add_special_tokens`` parameter set to ``True``. - - RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on - the right rather than the left. - + Args: + input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`transformers.RobertaTokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + :func:`transformers.PreTrainedTokenizer.encode_plus` for details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - **token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Optional segment token indices to indicate first and second portions of the inputs. - This embedding matrice is not trained (not pretrained during RoBERTa pretraining), you will have to train it - during finetuning. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` corresponds to a `sentence B` token - (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). - **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + + `What are token type IDs? <../glossary.html#token-type-ids>`_ + position_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1[``. - **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Selected in the range ``[0, config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`_ + head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`): Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: - ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. - **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``: - Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation. + :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**. + inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. """ @@ -168,36 +144,11 @@ ROBERTA_INPUTS_DOCSTRING = r""" @add_start_docstrings( "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", ROBERTA_START_DOCSTRING, - ROBERTA_INPUTS_DOCSTRING, ) class RobertaModel(BertModel): - r""" - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` - Sequence of hidden-states at the output of the last layer of the model. - **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during Bert pretraining. This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - tokenizer = RobertaTokenizer.from_pretrained('roberta-base') - model = RobertaModel.from_pretrained('roberta-base') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids) - last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple - + """ + This class overrides :class:`~transformers.BertModel`. Please check the + superclass for the appropriate documentation alongside usage examples. """ config_class = RobertaConfig pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP @@ -217,38 +168,9 @@ class RobertaModel(BertModel): @add_start_docstrings( - """RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING + """RoBERTa Model with a `language modeling` head on top. """, ROBERTA_START_DOCSTRING ) class RobertaForMaskedLM(BertPreTrainedModel): - r""" - **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Labels for computing the masked language modeling loss. - Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) - Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels - in ``[0, ..., config.vocab_size]`` - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Masked language modeling loss. - **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - tokenizer = RobertaTokenizer.from_pretrained('roberta-base') - model = RobertaForMaskedLM.from_pretrained('roberta-base') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, masked_lm_labels=input_ids) - loss, prediction_scores = outputs[:2] - - """ config_class = RobertaConfig pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "roberta" @@ -264,6 +186,7 @@ class RobertaForMaskedLM(BertPreTrainedModel): def get_output_embeddings(self): return self.lm_head.decoder + @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def forward( self, input_ids=None, @@ -274,6 +197,40 @@ class RobertaForMaskedLM(BertPreTrainedModel): inputs_embeds=None, masked_lm_labels=None, ): + r""" + masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the masked language modeling loss. + Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: + masked_lm_loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Masked language modeling loss. + prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + + Examples:: + + tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + model = RobertaForMaskedLM.from_pretrained('roberta-base') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, masked_lm_labels=input_ids) + loss, prediction_scores = outputs[:2] + + """ outputs = self.roberta( input_ids, attention_mask=attention_mask, @@ -324,39 +281,8 @@ class RobertaLMHead(nn.Module): """RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, ROBERTA_START_DOCSTRING, - ROBERTA_INPUTS_DOCSTRING, ) class RobertaForSequenceClassification(BertPreTrainedModel): - r""" - **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for computing the sequence classification/regression loss. - Indices should be in ``[0, ..., config.num_labels]``. - If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), - If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Classification (or regression if config.num_labels==1) loss. - **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` - Classification (or regression if config.num_labels==1) scores (before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - tokenizer = RobertaTokenizer.from_pretrained('roberta-base') - model = RobertaForSequenceClassification.from_pretrained('roberta-base') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, labels=labels) - loss, logits = outputs[:2] - - """ config_class = RobertaConfig pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "roberta" @@ -368,6 +294,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel): self.roberta = RobertaModel(config) self.classifier = RobertaClassificationHead(config) + @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def forward( self, input_ids=None, @@ -378,6 +305,41 @@ class RobertaForSequenceClassification(BertPreTrainedModel): inputs_embeds=None, labels=None, ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the sequence classification/regression loss. + Indices should be in :obj:`[0, ..., config.num_labels - 1]`. + If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.RobertaConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + + Examples:: + + tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + model = RobertaForSequenceClassification.from_pretrained('roberta-base') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] + + """ outputs = self.roberta( input_ids, attention_mask=attention_mask, @@ -407,78 +369,8 @@ class RobertaForSequenceClassification(BertPreTrainedModel): """Roberta Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """, ROBERTA_START_DOCSTRING, - ROBERTA_INPUTS_DOCSTRING, ) class RobertaForMultipleChoice(BertPreTrainedModel): - r""" - Inputs: - **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Indices of input sequence tokens in the vocabulary. - The second dimension of the input (`num_choices`) indicates the number of choices to score. - To match pre-training, RoBerta input sequence should be formatted with [CLS] and [SEP] tokens as follows: - - (a) For sequence pairs: - - ``tokens: [CLS] is this jack ##son ##ville ? [SEP] [SEP] no it is not . [SEP]`` - - ``token_type_ids: 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` - - (b) For single sequences: - - ``tokens: [CLS] the dog is hairy . [SEP]`` - - ``token_type_ids: 0 0 0 0 0 0 0`` - - Indices can be obtained using :class:`transformers.BertTokenizer`. - See :func:`transformers.PreTrainedTokenizer.encode` and - :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. - **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Segment token indices to indicate first and second portions of the inputs. - The second dimension of the input (`num_choices`) indicates the number of choices to score. - Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` - **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``: - Mask to avoid performing attention on padding token indices. - The second dimension of the input (`num_choices`) indicates the number of choices to score. - Mask values selected in ``[0, 1]``: - ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: - Mask to nullify selected heads of the self-attention modules. - Mask values selected in ``[0, 1]``: - ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. - **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``: - Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for computing the multiple choice classification loss. - Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension - of the input tensors. (see `input_ids` above) - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Classification loss. - **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension - of the input tensors. (see `input_ids` above). - Classification scores (before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - tokenizer = RobertaTokenizer.from_pretrained('roberta-base') - model = RobertaForMultipleChoice.from_pretrained('roberta-base') - choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] - input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices - labels = torch.tensor(1).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, labels=labels) - loss, classification_scores = outputs[:2] - - """ config_class = RobertaConfig pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "roberta" @@ -492,6 +384,7 @@ class RobertaForMultipleChoice(BertPreTrainedModel): self.init_weights() + @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def forward( self, input_ids=None, @@ -502,6 +395,43 @@ class RobertaForMultipleChoice(BertPreTrainedModel): head_mask=None, inputs_embeds=None, ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for computing the multiple choice classification loss. + Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above) + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (config) and inputs: + loss (:obj:`torch.FloatTensor`` of shape ``(1,)`, `optional`, returned when :obj:`labels` is provided): + Classification loss. + classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`): + `num_choices` is the second dimension of the input tensors. (see `input_ids` above). + + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + + Examples:: + + tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + model = RobertaForMultipleChoice.from_pretrained('roberta-base') + choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] + input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices + labels = torch.tensor(1).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, classification_scores = outputs[:2] + + """ num_choices = input_ids.shape[1] flat_input_ids = input_ids.view(-1, input_ids.size(-1)) @@ -535,37 +465,8 @@ class RobertaForMultipleChoice(BertPreTrainedModel): """Roberta Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, ROBERTA_START_DOCSTRING, - ROBERTA_INPUTS_DOCSTRING, ) class RobertaForTokenClassification(BertPreTrainedModel): - r""" - **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Labels for computing the token classification loss. - Indices should be in ``[0, ..., config.num_labels - 1]``. - - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Classification loss. - **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)`` - Classification scores (before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - tokenizer = RobertaTokenizer.from_pretrained('roberta-base') - model = RobertaForTokenClassification.from_pretrained('roberta-base') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 - labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, labels=labels) - loss, scores = outputs[:2] - - """ config_class = RobertaConfig pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "roberta" @@ -580,6 +481,7 @@ class RobertaForTokenClassification(BertPreTrainedModel): self.init_weights() + @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def forward( self, input_ids=None, @@ -590,6 +492,39 @@ class RobertaForTokenClassification(BertPreTrainedModel): inputs_embeds=None, labels=None, ): + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`): + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:obj:`~transformers.RobertaConfig`) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) : + Classification loss. + scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`) + Classification scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + + Examples:: + + tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + model = RobertaForTokenClassification.from_pretrained('roberta-base') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, scores = outputs[:2] + + """ outputs = self.roberta( input_ids, @@ -644,41 +579,8 @@ class RobertaClassificationHead(nn.Module): """Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """, ROBERTA_START_DOCSTRING, - ROBERTA_INPUTS_DOCSTRING, ) class RobertaForQuestionAnswering(BertPreTrainedModel): - r""" - **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for position (index) of the start of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: - Labels for position (index) of the end of the labelled span for computing the token classification loss. - Positions are clamped to the length of the sequence (`sequence_length`). - Position outside of the sequence are not taken into account for computing the loss. - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: - Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. - **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` - Span-start scores (before SoftMax). - **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` - Span-end scores (before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - Examples:: - tokenizer = RobertaTokenizer.from_pretrained('roberta-large') - model = RobertaForQuestionAnswering.from_pretrained('roberta-large') - question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" - input_ids = tokenizer.encode(question, text) - start_scores, end_scores = model(torch.tensor([input_ids])) - all_tokens = tokenizer.convert_ids_to_tokens(input_ids) - answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) - """ config_class = RobertaConfig pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "roberta" @@ -692,6 +594,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): self.init_weights() + @add_start_docstrings_to_callable(ROBERTA_INPUTS_DOCSTRING) def forward( self, input_ids, @@ -699,9 +602,49 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): token_type_ids=None, position_ids=None, head_mask=None, + inputs_embeds=None, start_positions=None, end_positions=None, ): + r""" + start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + + Returns: + :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (config) and inputs: + loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-start scores (before SoftMax). + end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`): + Span-end scores (before SoftMax). + hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_hidden_states=True``): + Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) + of shape :obj:`(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``): + Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape + :obj:`(batch_size, num_heads, sequence_length, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + + Examples:: + tokenizer = RobertaTokenizer.from_pretrained('roberta-large') + model = RobertaForQuestionAnswering.from_pretrained('roberta-large') + question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + input_ids = tokenizer.encode(question, text) + start_scores, end_scores = model(torch.tensor([input_ids])) + all_tokens = tokenizer.convert_ids_to_tokens(input_ids) + answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) + """ outputs = self.roberta( input_ids, @@ -709,6 +652,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): token_type_ids=token_type_ids, position_ids=position_ids, head_mask=head_mask, + inputs_embeds=inputs_embeds ) sequence_output = outputs[0]