[inputs_embeds] All PyTorch models
This commit is contained in:
@@ -48,16 +48,24 @@ class RobertaEmbeddings(BertEmbeddings):
|
||||
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size,
|
||||
padding_idx=self.padding_idx)
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, position_ids=None):
|
||||
seq_length = input_ids.size(1)
|
||||
def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
|
||||
if input_ids is not None:
|
||||
input_shape = input_ids.size()
|
||||
else:
|
||||
input_shape = inputs_embeds.size()[:-1]
|
||||
|
||||
seq_length = input_shape[1]
|
||||
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
||||
|
||||
if position_ids is None:
|
||||
# Position numbers begin at padding_idx+1. Padding symbols are ignored.
|
||||
# cf. fairseq's `utils.make_positions`
|
||||
position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device)
|
||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
||||
position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=device)
|
||||
position_ids = position_ids.unsqueeze(0).expand(input_shape)
|
||||
return super(RobertaEmbeddings, self).forward(input_ids,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids)
|
||||
position_ids=position_ids,
|
||||
inputs_embeds=inputs_embeds)
|
||||
|
||||
|
||||
ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in
|
||||
@@ -126,6 +134,10 @@ ROBERTA_INPUTS_DOCSTRING = r"""
|
||||
Mask to nullify selected heads of the self-attention modules.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||
than the model's internal embedding lookup matrix.
|
||||
"""
|
||||
|
||||
@add_start_docstrings("The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
|
||||
@@ -222,13 +234,14 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
||||
def get_output_embeddings(self):
|
||||
return self.lm_head.decoder
|
||||
|
||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||
masked_lm_labels=None):
|
||||
outputs = self.roberta(input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask)
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds)
|
||||
sequence_output = outputs[0]
|
||||
prediction_scores = self.lm_head(sequence_output)
|
||||
|
||||
@@ -309,13 +322,14 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
|
||||
self.roberta = RobertaModel(config)
|
||||
self.classifier = RobertaClassificationHead(config)
|
||||
|
||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, inputs_embeds=None,
|
||||
labels=None):
|
||||
outputs = self.roberta(input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask)
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds)
|
||||
sequence_output = outputs[0]
|
||||
logits = self.classifier(sequence_output)
|
||||
|
||||
@@ -372,6 +386,10 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
|
||||
Mask to nullify selected heads of the self-attention modules.
|
||||
Mask values selected in ``[0, 1]``:
|
||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||
**inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
|
||||
Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
|
||||
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
|
||||
than the model's internal embedding lookup matrix.
|
||||
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
||||
Labels for computing the multiple choice classification loss.
|
||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||
@@ -415,8 +433,8 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
|
||||
position_ids=None, head_mask=None):
|
||||
def forward(self, input_ids=None, token_type_ids=None, attention_mask=None, labels=None,
|
||||
position_ids=None, head_mask=None, inputs_embeds=None):
|
||||
num_choices = input_ids.shape[1]
|
||||
|
||||
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
|
||||
@@ -487,14 +505,15 @@ class RobertaForTokenClassification(BertPreTrainedModel):
|
||||
|
||||
self.init_weights()
|
||||
|
||||
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
|
||||
position_ids=None, head_mask=None, labels=None):
|
||||
def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
|
||||
position_ids=None, head_mask=None, inputs_embeds=None, labels=None):
|
||||
|
||||
outputs = self.roberta(input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
position_ids=position_ids,
|
||||
head_mask=head_mask)
|
||||
head_mask=head_mask,
|
||||
inputs_embeds=inputs_embeds)
|
||||
|
||||
sequence_output = outputs[0]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user