Merge remote-tracking branch 'origin/master' into run_multiple_choice_add_doc
This commit is contained in:
@@ -22,14 +22,11 @@ import logging
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.nn import CrossEntropyLoss, MSELoss
|
||||
|
||||
from pytorch_transformers.modeling_bert import (BertConfig, BertEmbeddings,
|
||||
BertLayerNorm, BertModel,
|
||||
BertPreTrainedModel, gelu)
|
||||
|
||||
from pytorch_transformers.modeling_utils import add_start_docstrings
|
||||
from .modeling_bert import BertEmbeddings, BertLayerNorm, BertModel, BertPreTrainedModel, gelu
|
||||
from .configuration_roberta import RobertaConfig
|
||||
from .file_utils import add_start_docstrings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -39,13 +36,6 @@ ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
|
||||
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
|
||||
}
|
||||
|
||||
ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
|
||||
'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
|
||||
'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
|
||||
}
|
||||
|
||||
|
||||
class RobertaEmbeddings(BertEmbeddings):
|
||||
"""
|
||||
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
|
||||
@@ -61,11 +51,9 @@ class RobertaEmbeddings(BertEmbeddings):
|
||||
# cf. fairseq's `utils.make_positions`
|
||||
position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device)
|
||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
||||
return super(RobertaEmbeddings, self).forward(input_ids, token_type_ids=token_type_ids, position_ids=position_ids)
|
||||
|
||||
|
||||
class RobertaConfig(BertConfig):
|
||||
pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
return super(RobertaEmbeddings, self).forward(input_ids,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids)
|
||||
|
||||
|
||||
ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
|
||||
@@ -116,13 +104,20 @@ ROBERTA_INPUTS_DOCSTRING = r"""
|
||||
|
||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Indices of positions of each input sequence tokens in the position embeddings.
|
||||
Selected in the range ``[0, config.max_position_embeddings - 1[``.
|
||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Mask to avoid performing attention on padding token indices.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||
**token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Optional segment token indices to indicate first and second portions of the inputs.
|
||||
This embedding matrice is not trained (not pretrained during RoBERTa pretraining), you will have to train it
|
||||
during finetuning.
|
||||
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||
corresponds to a `sentence B` token
|
||||
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||
Indices of positions of each input sequence tokens in the position embeddings.
|
||||
Selected in the range ``[0, config.max_position_embeddings - 1[``.
|
||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||
Mask to nullify selected heads of the self-attention modules.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
@@ -168,14 +163,18 @@ class RobertaModel(BertModel):
|
||||
super(RobertaModel, self).__init__(config)
|
||||
|
||||
self.embeddings = RobertaEmbeddings(config)
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
|
||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
||||
if input_ids[:, 0].sum().item() != 0:
|
||||
logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
|
||||
"This model requires special tokens in order to work. "
|
||||
"Please specify add_special_tokens=True in your encoding.")
|
||||
return super(RobertaModel, self).forward(input_ids, token_type_ids, attention_mask, position_ids, head_mask)
|
||||
return super(RobertaModel, self).forward(input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask)
|
||||
|
||||
|
||||
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
|
||||
@@ -220,7 +219,7 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
||||
self.roberta = RobertaModel(config)
|
||||
self.lm_head = RobertaLMHead(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.init_weights()
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
|
||||
@@ -229,10 +228,13 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
||||
"""
|
||||
self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings)
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, position_ids=None,
|
||||
head_mask=None):
|
||||
outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
||||
attention_mask=attention_mask, head_mask=head_mask)
|
||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||
masked_lm_labels=None):
|
||||
outputs = self.roberta(input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask)
|
||||
sequence_output = outputs[0]
|
||||
prediction_scores = self.lm_head(sequence_output)
|
||||
|
||||
@@ -313,10 +315,13 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
|
||||
self.roberta = RobertaModel(config)
|
||||
self.classifier = RobertaClassificationHead(config)
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
|
||||
position_ids=None, head_mask=None):
|
||||
outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
||||
attention_mask=attention_mask, head_mask=head_mask)
|
||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||
labels=None):
|
||||
outputs = self.roberta(input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask)
|
||||
sequence_output = outputs[0]
|
||||
logits = self.classifier(sequence_output)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user