WIP reordering arguments for torchscript and TF
This commit is contained in:
@@ -596,18 +596,18 @@ BERT_INPUTS_DOCSTRING = r"""
|
|||||||
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
|
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Indices of positions of each input sequence tokens in the position embeddings.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Segment token indices to indicate first and second portions of the inputs.
|
Segment token indices to indicate first and second portions of the inputs.
|
||||||
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||||
corresponds to a `sentence B` token
|
corresponds to a `sentence B` token
|
||||||
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
Mask values selected in ``[0, 1]``:
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -668,7 +668,7 @@ class BertModel(BertPreTrainedModel):
|
|||||||
for layer, heads in heads_to_prune.items():
|
for layer, heads in heads_to_prune.items():
|
||||||
self.encoder.layer[layer].attention.prune_heads(heads)
|
self.encoder.layer[layer].attention.prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
||||||
if attention_mask is None:
|
if attention_mask is None:
|
||||||
attention_mask = torch.ones_like(input_ids)
|
attention_mask = torch.ones_like(input_ids)
|
||||||
if token_type_ids is None:
|
if token_type_ids is None:
|
||||||
@@ -771,10 +771,14 @@ class BertForPreTraining(BertPreTrainedModel):
|
|||||||
self._tie_or_clone_weights(self.cls.predictions.decoder,
|
self._tie_or_clone_weights(self.cls.predictions.decoder,
|
||||||
self.bert.embeddings.word_embeddings)
|
self.bert.embeddings.word_embeddings)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
next_sentence_label=None, position_ids=None, head_mask=None):
|
masked_lm_labels=None, next_sentence_label=None):
|
||||||
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
outputs = self.bert(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
sequence_output, pooled_output = outputs[:2]
|
sequence_output, pooled_output = outputs[:2]
|
||||||
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
|
prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
|
||||||
@@ -839,10 +843,14 @@ class BertForMaskedLM(BertPreTrainedModel):
|
|||||||
self._tie_or_clone_weights(self.cls.predictions.decoder,
|
self._tie_or_clone_weights(self.cls.predictions.decoder,
|
||||||
self.bert.embeddings.word_embeddings)
|
self.bert.embeddings.word_embeddings)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
position_ids=None, head_mask=None):
|
masked_lm_labels=None):
|
||||||
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
outputs = self.bert(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
prediction_scores = self.cls(sequence_output)
|
prediction_scores = self.cls(sequence_output)
|
||||||
@@ -896,10 +904,15 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
position_ids=None, head_mask=None):
|
next_sentence_label=None):
|
||||||
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
outputs = self.bert(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
pooled_output = outputs[1]
|
pooled_output = outputs[1]
|
||||||
|
|
||||||
seq_relationship_score = self.cls(pooled_output)
|
seq_relationship_score = self.cls(pooled_output)
|
||||||
@@ -957,10 +970,15 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
|
||||||
position_ids=None, head_mask=None):
|
position_ids=None, head_mask=None, labels=None):
|
||||||
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
outputs = self.bert(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
pooled_output = outputs[1]
|
pooled_output = outputs[1]
|
||||||
|
|
||||||
pooled_output = self.dropout(pooled_output)
|
pooled_output = self.dropout(pooled_output)
|
||||||
@@ -983,45 +1001,9 @@ class BertForSequenceClassification(BertPreTrainedModel):
|
|||||||
|
|
||||||
@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
|
@add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
|
||||||
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
|
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
|
||||||
BERT_START_DOCSTRING)
|
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
|
||||||
class BertForMultipleChoice(BertPreTrainedModel):
|
class BertForMultipleChoice(BertPreTrainedModel):
|
||||||
r"""
|
r"""
|
||||||
Inputs:
|
|
||||||
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
Indices of input sequence tokens in the vocabulary.
|
|
||||||
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
|
||||||
To match pre-training, BERT input sequence should be formatted with [CLS] and [SEP] tokens as follows:
|
|
||||||
|
|
||||||
(a) For sequence pairs:
|
|
||||||
|
|
||||||
``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
|
|
||||||
|
|
||||||
``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
|
|
||||||
|
|
||||||
(b) For single sequences:
|
|
||||||
|
|
||||||
``tokens: [CLS] the dog is hairy . [SEP]``
|
|
||||||
|
|
||||||
``token_type_ids: 0 0 0 0 0 0 0``
|
|
||||||
|
|
||||||
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
|
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
Segment token indices to indicate first and second portions of the inputs.
|
|
||||||
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
|
||||||
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
|
||||||
corresponds to a `sentence B` token
|
|
||||||
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
Mask to avoid performing attention on padding token indices.
|
|
||||||
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
|
||||||
Mask values selected in ``[0, 1]``:
|
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
|
||||||
Mask values selected in ``[0, 1]``:
|
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
|
||||||
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
||||||
Labels for computing the multiple choice classification loss.
|
Labels for computing the multiple choice classification loss.
|
||||||
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
|
||||||
@@ -1061,16 +1043,21 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
|
||||||
position_ids=None, head_mask=None):
|
position_ids=None, head_mask=None, labels=None):
|
||||||
num_choices = input_ids.shape[1]
|
num_choices = input_ids.shape[1]
|
||||||
|
|
||||||
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
|
input_ids = input_ids.view(-1, input_ids.size(-1))
|
||||||
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
|
attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
|
||||||
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
|
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
|
||||||
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
|
position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
|
||||||
outputs = self.bert(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids,
|
|
||||||
attention_mask=flat_attention_mask, head_mask=head_mask)
|
outputs = self.bert(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
pooled_output = outputs[1]
|
pooled_output = outputs[1]
|
||||||
|
|
||||||
pooled_output = self.dropout(pooled_output)
|
pooled_output = self.dropout(pooled_output)
|
||||||
@@ -1129,10 +1116,15 @@ class BertForTokenClassification(BertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None,
|
||||||
position_ids=None, head_mask=None):
|
position_ids=None, head_mask=None, labels=None):
|
||||||
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
outputs = self.bert(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
|
|
||||||
sequence_output = self.dropout(sequence_output)
|
sequence_output = self.dropout(sequence_output)
|
||||||
@@ -1203,10 +1195,15 @@ class BertForQuestionAnswering(BertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
end_positions=None, position_ids=None, head_mask=None):
|
start_positions=None, end_positions=None):
|
||||||
outputs = self.bert(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
outputs = self.bert(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
|
|
||||||
logits = self.qa_outputs(sequence_output)
|
logits = self.qa_outputs(sequence_output)
|
||||||
|
|||||||
@@ -585,10 +585,10 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
|
|||||||
self._tie_or_clone_weights(self.vocab_projector,
|
self._tie_or_clone_weights(self.vocab_projector,
|
||||||
self.distilbert.embeddings.word_embeddings)
|
self.distilbert.embeddings.word_embeddings)
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, masked_lm_labels=None, head_mask=None):
|
def forward(self, input_ids, attention_mask=None, head_mask=None, masked_lm_labels=None):
|
||||||
dlbrt_output = self.distilbert(input_ids=input_ids,
|
dlbrt_output = self.distilbert(input_ids=input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
|
hidden_states = dlbrt_output[0] # (bs, seq_length, dim)
|
||||||
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
|
prediction_logits = self.vocab_transform(hidden_states) # (bs, seq_length, dim)
|
||||||
prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim)
|
prediction_logits = gelu(prediction_logits) # (bs, seq_length, dim)
|
||||||
@@ -649,10 +649,10 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, labels=None, head_mask=None):
|
def forward(self, input_ids, attention_mask=None, head_mask=None, labels=None):
|
||||||
distilbert_output = self.distilbert(input_ids=input_ids,
|
distilbert_output = self.distilbert(input_ids=input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
|
hidden_state = distilbert_output[0] # (bs, seq_len, dim)
|
||||||
pooled_output = hidden_state[:, 0] # (bs, dim)
|
pooled_output = hidden_state[:, 0] # (bs, dim)
|
||||||
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
|
pooled_output = self.pre_classifier(pooled_output) # (bs, dim)
|
||||||
@@ -723,10 +723,10 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, attention_mask=None, start_positions=None, end_positions=None, head_mask=None):
|
def forward(self, input_ids, attention_mask=None, head_mask=None, start_positions=None, end_positions=None):
|
||||||
distilbert_output = self.distilbert(input_ids=input_ids,
|
distilbert_output = self.distilbert(input_ids=input_ids,
|
||||||
attention_mask=attention_mask,
|
attention_mask=attention_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
|
hidden_states = distilbert_output[0] # (bs, max_query_len, dim)
|
||||||
|
|
||||||
hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim)
|
hidden_states = self.dropout(hidden_states) # (bs, max_query_len, dim)
|
||||||
|
|||||||
@@ -257,7 +257,7 @@ class Attention(nn.Module):
|
|||||||
self.n_head = self.n_head - len(heads)
|
self.n_head = self.n_head - len(heads)
|
||||||
self.pruned_heads = self.pruned_heads.union(heads)
|
self.pruned_heads = self.pruned_heads.union(heads)
|
||||||
|
|
||||||
def _attn(self, q, k, v, head_mask=None):
|
def _attn(self, q, k, v, attention_mask=None, head_mask=None):
|
||||||
w = torch.matmul(q, k)
|
w = torch.matmul(q, k)
|
||||||
if self.scale:
|
if self.scale:
|
||||||
w = w / math.sqrt(v.size(-1))
|
w = w / math.sqrt(v.size(-1))
|
||||||
@@ -265,6 +265,10 @@ class Attention(nn.Module):
|
|||||||
b = self.bias[:, :, ns-nd:ns, :ns]
|
b = self.bias[:, :, ns-nd:ns, :ns]
|
||||||
w = w * b - 1e4 * (1 - b)
|
w = w * b - 1e4 * (1 - b)
|
||||||
|
|
||||||
|
if attention_mask is not None:
|
||||||
|
# Apply the attention mask
|
||||||
|
w = w + attention_mask
|
||||||
|
|
||||||
w = nn.Softmax(dim=-1)(w)
|
w = nn.Softmax(dim=-1)(w)
|
||||||
w = self.attn_dropout(w)
|
w = self.attn_dropout(w)
|
||||||
|
|
||||||
@@ -290,7 +294,7 @@ class Attention(nn.Module):
|
|||||||
else:
|
else:
|
||||||
return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features)
|
return x.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features)
|
||||||
|
|
||||||
def forward(self, x, layer_past=None, head_mask=None):
|
def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
|
||||||
x = self.c_attn(x)
|
x = self.c_attn(x)
|
||||||
query, key, value = x.split(self.split_size, dim=2)
|
query, key, value = x.split(self.split_size, dim=2)
|
||||||
query = self.split_heads(query)
|
query = self.split_heads(query)
|
||||||
@@ -302,7 +306,7 @@ class Attention(nn.Module):
|
|||||||
value = torch.cat((past_value, value), dim=-2)
|
value = torch.cat((past_value, value), dim=-2)
|
||||||
present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking
|
present = torch.stack((key.transpose(-2, -1), value)) # transpose to have same shapes for stacking
|
||||||
|
|
||||||
attn_outputs = self._attn(query, key, value, head_mask)
|
attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
|
||||||
a = attn_outputs[0]
|
a = attn_outputs[0]
|
||||||
|
|
||||||
a = self.merge_heads(a)
|
a = self.merge_heads(a)
|
||||||
@@ -337,8 +341,11 @@ class Block(nn.Module):
|
|||||||
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
|
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
|
||||||
self.mlp = MLP(4 * nx, config)
|
self.mlp = MLP(4 * nx, config)
|
||||||
|
|
||||||
def forward(self, x, layer_past=None, head_mask=None):
|
def forward(self, x, layer_past=None, attention_mask=None, head_mask=None):
|
||||||
output_attn = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask)
|
output_attn = self.attn(self.ln_1(x),
|
||||||
|
layer_past=layer_past,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
head_mask=head_mask)
|
||||||
a = output_attn[0] # output_attn: a, present, (attentions)
|
a = output_attn[0] # output_attn: a, present, (attentions)
|
||||||
|
|
||||||
x = x + a
|
x = x + a
|
||||||
@@ -404,17 +411,21 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
|
Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
|
||||||
Indices of positions of each input sequence tokens in the position embeddings.
|
|
||||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
|
||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
|
||||||
**past**:
|
**past**:
|
||||||
list of ``torch.FloatTensor`` (one for each layer):
|
list of ``torch.FloatTensor`` (one for each layer):
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||||
(see `past` output below). Can be used to speed up sequential decoding.
|
(see `past` output below). Can be used to speed up sequential decoding.
|
||||||
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Mask to avoid performing attention on padding token indices.
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||||
|
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||||
|
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
||||||
|
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -473,7 +484,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
for layer, heads in heads_to_prune.items():
|
for layer, heads in heads_to_prune.items():
|
||||||
self.h[layer].attn.prune_heads(heads)
|
self.h[layer].attn.prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
|
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
||||||
if past is None:
|
if past is None:
|
||||||
past_length = 0
|
past_length = 0
|
||||||
past = [None] * len(self.h)
|
past = [None] * len(self.h)
|
||||||
@@ -483,6 +494,23 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
|
position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
|
||||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
||||||
|
|
||||||
|
# Attention mask.
|
||||||
|
if attention_mask is not None:
|
||||||
|
# We create a 4D attention mask from a 2D tensor mask.
|
||||||
|
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||||
|
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
||||||
|
# this attention mask is more simple than the triangular masking of causal attention
|
||||||
|
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
|
||||||
|
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
||||||
|
|
||||||
|
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
||||||
|
# masked positions, this operation will create a tensor which is 0.0 for
|
||||||
|
# positions we want to attend and -10000.0 for masked positions.
|
||||||
|
# Since we are adding it to the raw scores before the softmax, this is
|
||||||
|
# effectively the same as removing these entirely.
|
||||||
|
attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
|
||||||
|
attention_mask = (1.0 - attention_mask) * -10000.0
|
||||||
|
|
||||||
# Prepare head mask if needed
|
# Prepare head mask if needed
|
||||||
# 1.0 in head_mask indicate we keep the head
|
# 1.0 in head_mask indicate we keep the head
|
||||||
# attention_probs has shape bsz x n_heads x N x N
|
# attention_probs has shape bsz x n_heads x N x N
|
||||||
@@ -520,7 +548,11 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
if self.output_hidden_states:
|
if self.output_hidden_states:
|
||||||
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
|
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
|
||||||
|
|
||||||
outputs = block(hidden_states, layer_past, head_mask[i])
|
outputs = block(hidden_states,
|
||||||
|
past=layer_past,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
head_mask=head_mask[i])
|
||||||
|
|
||||||
hidden_states, present = outputs[:2]
|
hidden_states, present = outputs[:2]
|
||||||
presents = presents + (present,)
|
presents = presents + (present,)
|
||||||
|
|
||||||
@@ -601,9 +633,14 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
self._tie_or_clone_weights(self.lm_head,
|
self._tie_or_clone_weights(self.lm_head,
|
||||||
self.transformer.wte)
|
self.transformer.wte)
|
||||||
|
|
||||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None):
|
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
labels=None):
|
||||||
past=past, head_mask=head_mask)
|
transformer_outputs = self.transformer(input_ids,
|
||||||
|
past=past,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
|
|
||||||
lm_logits = self.lm_head(hidden_states)
|
lm_logits = self.lm_head(hidden_states)
|
||||||
@@ -626,33 +663,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
|
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
|
||||||
The language modeling head has its weights tied to the input embeddings,
|
The language modeling head has its weights tied to the input embeddings,
|
||||||
the classification head takes as input the input of a specified classification token index in the input sequence).
|
the classification head takes as input the input of a specified classification token index in the input sequence).
|
||||||
""", GPT2_START_DOCSTRING)
|
""", GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
|
||||||
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||||
r""" Inputs:
|
r"""
|
||||||
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
Indices of input sequence tokens in the vocabulary.
|
|
||||||
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
|
||||||
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
|
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
|
||||||
**mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
|
**mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
|
||||||
Index of the classification token in each input sequence.
|
Index of the classification token in each input sequence.
|
||||||
Selected in the range ``[0, input_ids.size(-1) - 1[``.
|
Selected in the range ``[0, input_ids.size(-1) - 1[``.
|
||||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
Indices of positions of each input sequence tokens in the position embeddings.
|
|
||||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
|
||||||
**past**:
|
|
||||||
list of ``torch.FloatTensor`` (one for each layer):
|
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
|
||||||
(see `past` output below). Can be used to speed up sequential decoding.
|
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
|
||||||
Mask values selected in ``[0, 1]``:
|
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
|
||||||
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Labels for language modeling.
|
Labels for language modeling.
|
||||||
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
||||||
@@ -725,10 +741,15 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
self._tie_or_clone_weights(self.lm_head,
|
self._tie_or_clone_weights(self.lm_head,
|
||||||
self.transformer.wte)
|
self.transformer.wte)
|
||||||
|
|
||||||
def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
|
def forward(self, input_ids, past=None, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
position_ids=None, past=None, head_mask=None):
|
mc_token_ids=None, lm_labels=None, mc_labels=None):
|
||||||
transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
past=past, head_mask=head_mask)
|
past=past,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
|
|
||||||
lm_logits = self.lm_head(hidden_states)
|
lm_logits = self.lm_head(hidden_states)
|
||||||
|
|||||||
@@ -270,7 +270,7 @@ class Attention(nn.Module):
|
|||||||
self.n_head = self.n_head - len(heads)
|
self.n_head = self.n_head - len(heads)
|
||||||
self.pruned_heads = self.pruned_heads.union(heads)
|
self.pruned_heads = self.pruned_heads.union(heads)
|
||||||
|
|
||||||
def _attn(self, q, k, v, head_mask=None):
|
def _attn(self, q, k, v, attention_mask=None, head_mask=None):
|
||||||
w = torch.matmul(q, k)
|
w = torch.matmul(q, k)
|
||||||
if self.scale:
|
if self.scale:
|
||||||
w = w / math.sqrt(v.size(-1))
|
w = w / math.sqrt(v.size(-1))
|
||||||
@@ -279,6 +279,10 @@ class Attention(nn.Module):
|
|||||||
b = self.bias[:, :, : w.size(-2), : w.size(-1)]
|
b = self.bias[:, :, : w.size(-2), : w.size(-1)]
|
||||||
w = w * b + -1e9 * (1 - b)
|
w = w * b + -1e9 * (1 - b)
|
||||||
|
|
||||||
|
if attention_mask is not None:
|
||||||
|
# Apply the attention mask
|
||||||
|
w = w + attention_mask
|
||||||
|
|
||||||
w = nn.Softmax(dim=-1)(w)
|
w = nn.Softmax(dim=-1)(w)
|
||||||
w = self.attn_dropout(w)
|
w = self.attn_dropout(w)
|
||||||
|
|
||||||
@@ -304,14 +308,14 @@ class Attention(nn.Module):
|
|||||||
else:
|
else:
|
||||||
return x.permute(0, 2, 1, 3)
|
return x.permute(0, 2, 1, 3)
|
||||||
|
|
||||||
def forward(self, x, head_mask=None):
|
def forward(self, x, attention_mask=None, head_mask=None):
|
||||||
x = self.c_attn(x)
|
x = self.c_attn(x)
|
||||||
query, key, value = x.split(self.split_size, dim=2)
|
query, key, value = x.split(self.split_size, dim=2)
|
||||||
query = self.split_heads(query)
|
query = self.split_heads(query)
|
||||||
key = self.split_heads(key, k=True)
|
key = self.split_heads(key, k=True)
|
||||||
value = self.split_heads(value)
|
value = self.split_heads(value)
|
||||||
|
|
||||||
attn_outputs = self._attn(query, key, value, head_mask)
|
attn_outputs = self._attn(query, key, value, attention_mask, head_mask)
|
||||||
a = attn_outputs[0]
|
a = attn_outputs[0]
|
||||||
|
|
||||||
a = self.merge_heads(a)
|
a = self.merge_heads(a)
|
||||||
@@ -346,8 +350,8 @@ class Block(nn.Module):
|
|||||||
self.mlp = MLP(4 * nx, config)
|
self.mlp = MLP(4 * nx, config)
|
||||||
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
|
self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
|
||||||
|
|
||||||
def forward(self, x, head_mask=None):
|
def forward(self, x, attention_mask=None, head_mask=None):
|
||||||
attn_outputs = self.attn(x, head_mask=head_mask)
|
attn_outputs = self.attn(x, attention_mask=attention_mask, head_mask=head_mask)
|
||||||
a = attn_outputs[0]
|
a = attn_outputs[0]
|
||||||
|
|
||||||
n = self.ln_1(x + a)
|
n = self.ln_1(x + a)
|
||||||
@@ -410,13 +414,17 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
Indices can be obtained using :class:`pytorch_transformers.OpenAIGPTTokenizer`.
|
Indices can be obtained using :class:`pytorch_transformers.OpenAIGPTTokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Indices of positions of each input sequence tokens in the position embeddings.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices)
|
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices)
|
||||||
|
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -470,7 +478,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||||||
for layer, heads in heads_to_prune.items():
|
for layer, heads in heads_to_prune.items():
|
||||||
self.h[layer].attn.prune_heads(heads)
|
self.h[layer].attn.prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
||||||
if position_ids is None:
|
if position_ids is None:
|
||||||
# This was used when we had a single embedding matrix from position and token embeddings
|
# This was used when we had a single embedding matrix from position and token embeddings
|
||||||
# start = self.config.vocab_size + self.config.n_special
|
# start = self.config.vocab_size + self.config.n_special
|
||||||
@@ -479,6 +487,23 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||||||
position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
|
position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
|
||||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
||||||
|
|
||||||
|
# Attention mask.
|
||||||
|
if attention_mask is not None:
|
||||||
|
# We create a 4D attention mask from a 2D tensor mask.
|
||||||
|
# Sizes are [batch_size, 1, 1, to_seq_length]
|
||||||
|
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
|
||||||
|
# this attention mask is simpler than the triangular masking of causal attention
|
||||||
|
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
|
||||||
|
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
||||||
|
|
||||||
|
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
|
||||||
|
# masked positions, this operation will create a tensor which is 0.0 for
|
||||||
|
# positions we want to attend and -10000.0 for masked positions.
|
||||||
|
# Since we are adding it to the raw scores before the softmax, this is
|
||||||
|
# effectively the same as removing these entirely.
|
||||||
|
attention_mask = attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
|
||||||
|
attention_mask = (1.0 - attention_mask) * -10000.0
|
||||||
|
|
||||||
# Prepare head mask if needed
|
# Prepare head mask if needed
|
||||||
# 1.0 in head_mask indicates we keep the head
|
# 1.0 in head_mask indicates we keep the head
|
||||||
# attention_probs has shape bsz x n_heads x N x N
|
# attention_probs has shape bsz x n_heads x N x N
|
||||||
@@ -515,7 +540,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||||||
if self.output_hidden_states:
|
if self.output_hidden_states:
|
||||||
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
|
all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
|
||||||
|
|
||||||
outputs = block(hidden_states, head_mask[i])
|
outputs = block(hidden_states, attention_mask, head_mask[i])
|
||||||
hidden_states = outputs[0]
|
hidden_states = outputs[0]
|
||||||
if self.output_attentions:
|
if self.output_attentions:
|
||||||
all_attentions = all_attentions + (outputs[1],)
|
all_attentions = all_attentions + (outputs[1],)
|
||||||
@@ -580,8 +605,12 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
|||||||
self._tie_or_clone_weights(self.lm_head,
|
self._tie_or_clone_weights(self.lm_head,
|
||||||
self.transformer.tokens_embed)
|
self.transformer.tokens_embed)
|
||||||
|
|
||||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, head_mask=None):
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
labels=None):
|
||||||
|
transformer_outputs = self.transformer(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
lm_logits = self.lm_head(hidden_states)
|
lm_logits = self.lm_head(hidden_states)
|
||||||
@@ -604,29 +633,12 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
|||||||
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
|
head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
|
||||||
The language modeling head has its weights tied to the input embeddings,
|
The language modeling head has its weights tied to the input embeddings,
|
||||||
the classification head takes as input the input of a specified classification token index in the input sequence).
|
the classification head takes as input the input of a specified classification token index in the input sequence).
|
||||||
""", OPENAI_GPT_START_DOCSTRING)
|
""", OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
|
||||||
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||||
r""" Inputs:
|
r"""
|
||||||
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
Indices of input sequence tokens in the vocabulary.
|
|
||||||
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
|
||||||
Indices can be obtained using :class:`pytorch_transformers.OpenAIGPTTokenizer`.
|
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
|
||||||
**mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
|
**mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
|
||||||
Index of the classification token in each input sequence.
|
Index of the classification token in each input sequence.
|
||||||
Selected in the range ``[0, input_ids.size(-1) - 1]``.
|
Selected in the range ``[0, input_ids.size(-1) - 1]``.
|
||||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
Indices of positions of each input sequence tokens in the position embeddings.
|
|
||||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
|
||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
|
||||||
Mask values selected in ``[0, 1]``:
|
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
|
||||||
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Labels for language modeling.
|
Labels for language modeling.
|
||||||
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
|
||||||
@@ -687,9 +699,12 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
self._tie_or_clone_weights(self.lm_head,
|
self._tie_or_clone_weights(self.lm_head,
|
||||||
self.transformer.tokens_embed)
|
self.transformer.tokens_embed)
|
||||||
|
|
||||||
def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
position_ids=None, head_mask=None):
|
lm_labels=None, mc_labels=None):
|
||||||
transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
|
|
||||||
|
|||||||
@@ -61,7 +61,9 @@ class RobertaEmbeddings(BertEmbeddings):
|
|||||||
# cf. fairseq's `utils.make_positions`
|
# cf. fairseq's `utils.make_positions`
|
||||||
position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device)
|
position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device)
|
||||||
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
|
||||||
return super(RobertaEmbeddings, self).forward(input_ids, token_type_ids=token_type_ids, position_ids=position_ids)
|
return super(RobertaEmbeddings, self).forward(input_ids,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids)
|
||||||
|
|
||||||
|
|
||||||
class RobertaConfig(BertConfig):
|
class RobertaConfig(BertConfig):
|
||||||
@@ -116,13 +118,20 @@ ROBERTA_INPUTS_DOCSTRING = r"""
|
|||||||
|
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
|
||||||
Indices of positions of each input sequence tokens in the position embeddings.
|
|
||||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
|
**token_type_ids**: (`optional`, needs to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Optional segment token indices to indicate first and second portions of the inputs.
|
||||||
|
This embedding matrix is not trained (not pretrained during RoBERTa pretraining); you will have to train it
|
||||||
|
during finetuning.
|
||||||
|
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||||
|
corresponds to a `sentence B` token
|
||||||
|
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
||||||
|
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -170,12 +179,16 @@ class RobertaModel(BertModel):
|
|||||||
self.embeddings = RobertaEmbeddings(config)
|
self.embeddings = RobertaEmbeddings(config)
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
|
||||||
if input_ids[:, 0].sum().item() != 0:
|
if input_ids[:, 0].sum().item() != 0:
|
||||||
logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
|
logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
|
||||||
"This model requires special tokens in order to work. "
|
"This model requires special tokens in order to work. "
|
||||||
"Please specify add_special_tokens=True in your encoding.")
|
"Please specify add_special_tokens=True in your encoding.")
|
||||||
return super(RobertaModel, self).forward(input_ids, token_type_ids, attention_mask, position_ids, head_mask)
|
return super(RobertaModel, self).forward(input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
|
|
||||||
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
|
@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
|
||||||
@@ -229,10 +242,13 @@ class RobertaForMaskedLM(BertPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings)
|
self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, position_ids=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
head_mask=None):
|
masked_lm_labels=None):
|
||||||
outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
outputs = self.roberta(input_ids,
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
prediction_scores = self.lm_head(sequence_output)
|
prediction_scores = self.lm_head(sequence_output)
|
||||||
|
|
||||||
@@ -313,10 +329,13 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
|
|||||||
self.roberta = RobertaModel(config)
|
self.roberta = RobertaModel(config)
|
||||||
self.classifier = RobertaClassificationHead(config)
|
self.classifier = RobertaClassificationHead(config)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
|
def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
|
||||||
position_ids=None, head_mask=None):
|
labels=None):
|
||||||
outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
|
outputs = self.roberta(input_ids,
|
||||||
attention_mask=attention_mask, head_mask=head_mask)
|
attention_mask=attention_mask,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
head_mask=head_mask)
|
||||||
sequence_output = outputs[0]
|
sequence_output = outputs[0]
|
||||||
logits = self.classifier(sequence_output)
|
logits = self.classifier(sequence_output)
|
||||||
|
|
||||||
|
|||||||
@@ -1342,7 +1342,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
|
|||||||
def init_mems(self, data):
|
def init_mems(self, data):
|
||||||
return self.transformer.init_mems(data)
|
return self.transformer.init_mems(data)
|
||||||
|
|
||||||
def forward(self, input_ids, labels=None, mems=None, head_mask=None):
|
def forward(self, input_ids, mems=None, head_mask=None, labels=None):
|
||||||
bsz = input_ids.size(0)
|
bsz = input_ids.size(0)
|
||||||
tgt_len = input_ids.size(1)
|
tgt_len = input_ids.size(1)
|
||||||
|
|
||||||
|
|||||||
@@ -441,23 +441,23 @@ XLM_INPUTS_DOCSTRING = r"""
|
|||||||
Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
|
Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Indices of positions of each input sequence tokens in the position embeddings.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
Mask values selected in ``[0, 1]``:
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
|
||||||
**langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
**langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
A parallel sequence of tokens to be used to indicate the language of each token in the input.
|
A parallel sequence of tokens to be used to indicate the language of each token in the input.
|
||||||
Indices are languages ids which can be obtained from the language names by using two conversion mappings
|
Indices are languages ids which can be obtained from the language names by using two conversion mappings
|
||||||
provided in the configuration of the model (only provided for multilingual models).
|
provided in the configuration of the model (only provided for multilingual models).
|
||||||
More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
|
More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
|
||||||
the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
|
the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||||
Mask values selected in ``[0, 1]``:
|
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
||||||
|
**position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Indices of positions of each input sequence tokens in the position embeddings.
|
||||||
|
Selected in the range ``[0, config.max_position_embeddings - 1]``.
|
||||||
**lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
**lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
|
||||||
Length of each sentence that can be used to avoid performing attention on padding token indices.
|
Length of each sentence that can be used to avoid performing attention on padding token indices.
|
||||||
You can also use `attention_mask` for the same result (see above), kept here for compatibility.
|
You can also use `attention_mask` for the same result (see above), kept here for compatibility.
|
||||||
@@ -584,8 +584,8 @@ class XLMModel(XLMPreTrainedModel):
|
|||||||
for layer, heads in heads_to_prune.items():
|
for layer, heads in heads_to_prune.items():
|
||||||
self.attentions[layer].prune_heads(heads)
|
self.attentions[layer].prune_heads(heads)
|
||||||
|
|
||||||
def forward(self, input_ids, lengths=None, position_ids=None, langs=None,
|
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
|
||||||
token_type_ids=None, attention_mask=None, cache=None, head_mask=None): # src_enc=None, src_len=None,
|
lengths=None, cache=None, head_mask=None): # removed: src_enc=None, src_len=None
|
||||||
if lengths is None:
|
if lengths is None:
|
||||||
lengths = (input_ids != self.pad_index).sum(dim=1).long()
|
lengths = (input_ids != self.pad_index).sum(dim=1).long()
|
||||||
# mask = input_ids != self.pad_index
|
# mask = input_ids != self.pad_index
|
||||||
@@ -790,11 +790,16 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)
|
self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)
|
||||||
|
|
||||||
def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
|
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
|
||||||
attention_mask=None, cache=None, labels=None, head_mask=None):
|
lengths=None, cache=None, head_mask=None, labels=None):
|
||||||
transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
token_type_ids=token_type_ids, langs=langs,
|
attention_mask=attention_mask,
|
||||||
attention_mask=attention_mask, cache=cache, head_mask=head_mask)
|
langs=langs,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
lengths=lengths,
|
||||||
|
cache=cache,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
output = transformer_outputs[0]
|
output = transformer_outputs[0]
|
||||||
outputs = self.pred_layer(output, labels)
|
outputs = self.pred_layer(output, labels)
|
||||||
@@ -846,11 +851,16 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
|
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
|
||||||
attention_mask=None, cache=None, labels=None, head_mask=None):
|
lengths=None, cache=None, head_mask=None, labels=None):
|
||||||
transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
token_type_ids=token_type_ids, langs=langs,
|
attention_mask=attention_mask,
|
||||||
attention_mask=attention_mask, cache=cache, head_mask=head_mask)
|
langs=langs,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
lengths=lengths,
|
||||||
|
cache=cache,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
output = transformer_outputs[0]
|
output = transformer_outputs[0]
|
||||||
logits = self.sequence_summary(output)
|
logits = self.sequence_summary(output)
|
||||||
@@ -924,12 +934,17 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
|
def forward(self, input_ids, attention_mask=None, langs=None, token_type_ids=None, position_ids=None,
|
||||||
attention_mask=None, cache=None, start_positions=None, end_positions=None,
|
lengths=None, cache=None, head_mask=None, start_positions=None, end_positions=None,
|
||||||
cls_index=None, is_impossible=None, p_mask=None, head_mask=None):
|
is_impossible=None, cls_index=None, p_mask=None):
|
||||||
transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
|
transformer_outputs = self.transformer(input_ids,
|
||||||
token_type_ids=token_type_ids, langs=langs,
|
attention_mask=attention_mask,
|
||||||
attention_mask=attention_mask, cache=cache, head_mask=head_mask)
|
langs=langs,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
position_ids=position_ids,
|
||||||
|
lengths=lengths,
|
||||||
|
cache=cache,
|
||||||
|
head_mask=head_mask)
|
||||||
|
|
||||||
output = transformer_outputs[0]
|
output = transformer_outputs[0]
|
||||||
|
|
||||||
|
|||||||
@@ -647,21 +647,10 @@ XLNET_INPUTS_DOCSTRING = r"""
|
|||||||
Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
|
Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
|
||||||
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
|
||||||
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
|
||||||
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
|
||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
|
||||||
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
**input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
|
||||||
Mask to avoid performing attention on padding token indices.
|
|
||||||
Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
|
|
||||||
Kept for compatibility with the original code base.
|
|
||||||
You can only use one of `input_mask` and `attention_mask`
|
|
||||||
Mask values selected in ``[0, 1]``:
|
|
||||||
``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
|
|
||||||
**mems**: (`optional`)
|
**mems**: (`optional`)
|
||||||
list of ``torch.FloatTensor`` (one for each layer):
|
list of ``torch.FloatTensor`` (one for each layer):
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model
|
that contains pre-computed hidden-states (key and values in the attention blocks) as output by the model
|
||||||
@@ -679,6 +668,17 @@ XLNET_INPUTS_DOCSTRING = r"""
|
|||||||
Mask to indicate the output tokens to use.
|
Mask to indicate the output tokens to use.
|
||||||
If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k is on the j-th token.
|
If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k is on the j-th token.
|
||||||
Only used during pretraining for partial prediction or for sequential decoding (generation).
|
Only used during pretraining for partial prediction or for sequential decoding (generation).
|
||||||
|
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||||
|
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||||
|
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
||||||
|
**input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
|
Mask to avoid performing attention on padding token indices.
|
||||||
|
Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
|
||||||
|
Kept for compatibility with the original code base.
|
||||||
|
You can only use one of `input_mask` and `attention_mask`
|
||||||
|
Mask values selected in ``[0, 1]``:
|
||||||
|
``1`` for tokens that are MASKED, ``0`` for tokens that are NOT MASKED.
|
||||||
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
@@ -837,8 +837,8 @@ class XLNetModel(XLNetPreTrainedModel):
|
|||||||
pos_emb = pos_emb.to(next(self.parameters()))
|
pos_emb = pos_emb.to(next(self.parameters()))
|
||||||
return pos_emb
|
return pos_emb
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
|
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
|
||||||
mems=None, perm_mask=None, target_mapping=None, head_mask=None):
|
token_type_ids=None, input_mask=None, head_mask=None):
|
||||||
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
|
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
|
||||||
# but we want a unified interface in the library with the batch size on the first dimension
|
# but we want a unified interface in the library with the batch size on the first dimension
|
||||||
# so we move here the first dimension (batch) to the end
|
# so we move here the first dimension (batch) to the end
|
||||||
@@ -1042,12 +1042,15 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)
|
self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding)
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
|
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
|
||||||
mems=None, perm_mask=None, target_mapping=None,
|
token_type_ids=None, input_mask=None, head_mask=None, labels=None):
|
||||||
labels=None, head_mask=None):
|
transformer_outputs = self.transformer(input_ids,
|
||||||
transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids,
|
attention_mask=attention_mask,
|
||||||
input_mask=input_mask, attention_mask=attention_mask,
|
mems=mems,
|
||||||
mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
|
perm_mask=perm_mask,
|
||||||
|
target_mapping=target_mapping,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
input_mask=input_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
|
|
||||||
logits = self.lm_loss(transformer_outputs[0])
|
logits = self.lm_loss(transformer_outputs[0])
|
||||||
@@ -1113,12 +1116,15 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
|
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
|
||||||
mems=None, perm_mask=None, target_mapping=None,
|
token_type_ids=None, input_mask=None, head_mask=None, labels=None):
|
||||||
labels=None, head_mask=None):
|
transformer_outputs = self.transformer(input_ids,
|
||||||
transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids,
|
attention_mask=attention_mask,
|
||||||
input_mask=input_mask, attention_mask=attention_mask,
|
mems=mems,
|
||||||
mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
|
perm_mask=perm_mask,
|
||||||
|
target_mapping=target_mapping,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
input_mask=input_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
output = transformer_outputs[0]
|
output = transformer_outputs[0]
|
||||||
|
|
||||||
@@ -1215,13 +1221,16 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
|
|||||||
|
|
||||||
self.init_weights()
|
self.init_weights()
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, input_mask=None, attention_mask=None,
|
def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None,
|
||||||
mems=None, perm_mask=None, target_mapping=None,
|
token_type_ids=None, input_mask=None, head_mask=None,
|
||||||
start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None,
|
start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,):
|
||||||
head_mask=None):
|
transformer_outputs = self.transformer(input_ids,
|
||||||
transformer_outputs = self.transformer(input_ids, token_type_ids=token_type_ids,
|
attention_mask=attention_mask,
|
||||||
input_mask=input_mask, attention_mask=attention_mask,
|
mems=mems,
|
||||||
mems=mems, perm_mask=perm_mask, target_mapping=target_mapping,
|
perm_mask=perm_mask,
|
||||||
|
target_mapping=target_mapping,
|
||||||
|
token_type_ids=token_type_ids,
|
||||||
|
input_mask=input_mask,
|
||||||
head_mask=head_mask)
|
head_mask=head_mask)
|
||||||
hidden_states = transformer_outputs[0]
|
hidden_states = transformer_outputs[0]
|
||||||
start_logits = self.start_logits(hidden_states, p_mask=p_mask)
|
start_logits = self.start_logits(hidden_states, p_mask=p_mask)
|
||||||
|
|||||||
@@ -126,8 +126,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertModel(config=config)
|
model = BertModel(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
|
sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
|
||||||
sequence_output, pooled_output = model(input_ids, token_type_ids)
|
sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
|
||||||
sequence_output, pooled_output = model(input_ids)
|
sequence_output, pooled_output = model(input_ids)
|
||||||
|
|
||||||
result = {
|
result = {
|
||||||
@@ -143,7 +143,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertForMaskedLM(config=config)
|
model = BertForMaskedLM(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
|
loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"prediction_scores": prediction_scores,
|
"prediction_scores": prediction_scores,
|
||||||
@@ -156,7 +156,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertForNextSentencePrediction(config=config)
|
model = BertForNextSentencePrediction(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, seq_relationship_score = model(input_ids, token_type_ids, input_mask, sequence_labels)
|
loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"seq_relationship_score": seq_relationship_score,
|
"seq_relationship_score": seq_relationship_score,
|
||||||
@@ -170,7 +170,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertForPreTraining(config=config)
|
model = BertForPreTraining(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
|
loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||||
|
masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"prediction_scores": prediction_scores,
|
"prediction_scores": prediction_scores,
|
||||||
@@ -188,7 +189,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = BertForQuestionAnswering(config=config)
|
model = BertForQuestionAnswering(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, start_logits, end_logits = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
|
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
|
||||||
|
start_positions=sequence_labels, end_positions=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"start_logits": start_logits,
|
"start_logits": start_logits,
|
||||||
@@ -207,7 +209,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = BertForSequenceClassification(config)
|
model = BertForSequenceClassification(config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, logits = model(input_ids, token_type_ids, input_mask, sequence_labels)
|
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"logits": logits,
|
"logits": logits,
|
||||||
@@ -222,7 +224,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = BertForTokenClassification(config=config)
|
model = BertForTokenClassification(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, logits = model(input_ids, token_type_ids, input_mask, token_labels)
|
loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"logits": logits,
|
"logits": logits,
|
||||||
@@ -241,9 +243,9 @@ class BertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||||
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
|
||||||
loss, logits = model(multiple_choice_inputs_ids,
|
loss, logits = model(multiple_choice_inputs_ids,
|
||||||
multiple_choice_token_type_ids,
|
attention_mask=multiple_choice_input_mask,
|
||||||
multiple_choice_input_mask,
|
token_type_ids=multiple_choice_token_type_ids,
|
||||||
choice_labels)
|
labels=choice_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"logits": logits,
|
"logits": logits,
|
||||||
|
|||||||
@@ -148,7 +148,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
|
||||||
model = DistilBertForQuestionAnswering(config=config)
|
model = DistilBertForQuestionAnswering(config=config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, start_logits, end_logits = model(input_ids, input_mask, sequence_labels, sequence_labels)
|
loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"start_logits": start_logits,
|
"start_logits": start_logits,
|
||||||
@@ -166,7 +166,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
|
|||||||
config.num_labels = self.num_labels
|
config.num_labels = self.num_labels
|
||||||
model = DistilBertForSequenceClassification(config)
|
model = DistilBertForSequenceClassification(config)
|
||||||
model.eval()
|
model.eval()
|
||||||
loss, logits = model(input_ids, input_mask, sequence_labels)
|
loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
|
||||||
result = {
|
result = {
|
||||||
"loss": loss,
|
"loss": loss,
|
||||||
"logits": logits,
|
"logits": logits,
|
||||||
|
|||||||
Reference in New Issue
Block a user