fix #827
This commit is contained in:
@@ -36,7 +36,7 @@ def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_p
|
|||||||
model = chkpt['model']
|
model = chkpt['model']
|
||||||
|
|
||||||
config = chkpt['params']
|
config = chkpt['params']
|
||||||
config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.Tensor, numpy.ndarray)))
|
config = dict((n, v) for n, v in config.items() if not isinstance(v, (torch.FloatTensor, numpy.ndarray)))
|
||||||
|
|
||||||
vocab = chkpt['dico_word2id']
|
vocab = chkpt['dico_word2id']
|
||||||
vocab = dict((s + '</w>' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items())
|
vocab = dict((s + '</w>' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in vocab.items())
|
||||||
|
|||||||
@@ -609,11 +609,11 @@ BERT_INPUTS_DOCSTRING = r"""
|
|||||||
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||||
corresponds to a `sentence B` token
|
corresponds to a `sentence B` token
|
||||||
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
||||||
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
@@ -1027,12 +1027,12 @@ class BertForMultipleChoice(BertPreTrainedModel):
|
|||||||
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
|
||||||
corresponds to a `sentence B` token
|
corresponds to a `sentence B` token
|
||||||
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
(see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
|
||||||
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
The second dimension of the input (`num_choices`) indicates the number of choices to score.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
|||||||
@@ -402,11 +402,11 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
list of ``torch.FloatTensor`` (one for each layer):
|
list of ``torch.FloatTensor`` (one for each layer):
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||||
(see `past` output below). Can be used to speed up sequential decoding.
|
(see `past` output below). Can be used to speed up sequential decoding.
|
||||||
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
@@ -638,11 +638,11 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
list of ``torch.FloatTensor`` (one for each layer):
|
list of ``torch.FloatTensor`` (one for each layer):
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||||
(see `past` output below). Can be used to speed up sequential decoding.
|
(see `past` output below). Can be used to speed up sequential decoding.
|
||||||
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
|||||||
@@ -412,11 +412,11 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
|
|||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
||||||
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
@@ -624,11 +624,11 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
||||||
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
|||||||
@@ -394,8 +394,8 @@ class MultiHeadAttn(nn.Module):
|
|||||||
self.pre_lnorm = pre_lnorm
|
self.pre_lnorm = pre_lnorm
|
||||||
|
|
||||||
if r_r_bias is None or r_w_bias is None: # Biases are not shared
|
if r_r_bias is None or r_w_bias is None: # Biases are not shared
|
||||||
self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
|
self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
|
||||||
self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
|
self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
|
||||||
else:
|
else:
|
||||||
self.r_r_bias = r_r_bias
|
self.r_r_bias = r_r_bias
|
||||||
self.r_w_bias = r_w_bias
|
self.r_w_bias = r_w_bias
|
||||||
@@ -483,8 +483,8 @@ class RelMultiHeadAttn(nn.Module):
|
|||||||
self.pre_lnorm = pre_lnorm
|
self.pre_lnorm = pre_lnorm
|
||||||
|
|
||||||
if r_r_bias is None or r_w_bias is None: # Biases are not shared
|
if r_r_bias is None or r_w_bias is None: # Biases are not shared
|
||||||
self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
|
self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
|
||||||
self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
|
self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
|
||||||
else:
|
else:
|
||||||
self.r_r_bias = r_r_bias
|
self.r_r_bias = r_r_bias
|
||||||
self.r_w_bias = r_w_bias
|
self.r_w_bias = r_w_bias
|
||||||
@@ -803,13 +803,13 @@ class AdaptiveEmbedding(nn.Module):
|
|||||||
nn.Embedding(n_token, d_embed, sparse=sample_softmax>0)
|
nn.Embedding(n_token, d_embed, sparse=sample_softmax>0)
|
||||||
)
|
)
|
||||||
if d_proj != d_embed:
|
if d_proj != d_embed:
|
||||||
self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_embed)))
|
self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
|
||||||
else:
|
else:
|
||||||
for i in range(len(self.cutoffs)):
|
for i in range(len(self.cutoffs)):
|
||||||
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
|
l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i+1]
|
||||||
d_emb_i = d_embed // (div_val ** i)
|
d_emb_i = d_embed // (div_val ** i)
|
||||||
self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i))
|
self.emb_layers.append(nn.Embedding(r_idx-l_idx, d_emb_i))
|
||||||
self.emb_projs.append(nn.Parameter(torch.Tensor(d_proj, d_emb_i)))
|
self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))
|
||||||
|
|
||||||
def forward(self, inp):
|
def forward(self, inp):
|
||||||
if self.div_val == 1:
|
if self.div_val == 1:
|
||||||
@@ -941,7 +941,7 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
|
|||||||
list of ``torch.FloatTensor`` (one for each layer):
|
list of ``torch.FloatTensor`` (one for each layer):
|
||||||
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
|
||||||
(see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
|
(see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
|
||||||
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
@@ -1003,8 +1003,8 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
|||||||
self.attn_type = config.attn_type
|
self.attn_type = config.attn_type
|
||||||
|
|
||||||
if not config.untie_r:
|
if not config.untie_r:
|
||||||
self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
|
self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
|
||||||
self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
|
self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
|
||||||
|
|
||||||
self.layers = nn.ModuleList()
|
self.layers = nn.ModuleList()
|
||||||
if config.attn_type == 0: # the default attention
|
if config.attn_type == 0: # the default attention
|
||||||
@@ -1046,14 +1046,14 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
|||||||
if self.attn_type == 0: # default attention
|
if self.attn_type == 0: # default attention
|
||||||
self.pos_emb = PositionalEmbedding(self.d_model)
|
self.pos_emb = PositionalEmbedding(self.d_model)
|
||||||
elif self.attn_type == 1: # learnable
|
elif self.attn_type == 1: # learnable
|
||||||
self.r_emb = nn.Parameter(torch.Tensor(
|
self.r_emb = nn.Parameter(torch.FloatTensor(
|
||||||
self.n_layer, self.max_klen, self.n_head, self.d_head))
|
self.n_layer, self.max_klen, self.n_head, self.d_head))
|
||||||
self.r_bias = nn.Parameter(torch.Tensor(
|
self.r_bias = nn.Parameter(torch.FloatTensor(
|
||||||
self.n_layer, self.max_klen, self.n_head))
|
self.n_layer, self.max_klen, self.n_head))
|
||||||
elif self.attn_type == 2: # absolute standard
|
elif self.attn_type == 2: # absolute standard
|
||||||
self.pos_emb = PositionalEmbedding(self.d_model)
|
self.pos_emb = PositionalEmbedding(self.d_model)
|
||||||
elif self.attn_type == 3: # absolute deeper SA
|
elif self.attn_type == 3: # absolute deeper SA
|
||||||
self.r_emb = nn.Parameter(torch.Tensor(
|
self.r_emb = nn.Parameter(torch.FloatTensor(
|
||||||
self.n_layer, self.max_klen, self.n_head, self.d_head))
|
self.n_layer, self.max_klen, self.n_head, self.d_head))
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.apply(self.init_weights)
|
||||||
|
|||||||
@@ -56,7 +56,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
|
|||||||
for i in range(len(self.cutoffs)):
|
for i in range(len(self.cutoffs)):
|
||||||
if d_proj != d_embed:
|
if d_proj != d_embed:
|
||||||
self.out_projs.append(
|
self.out_projs.append(
|
||||||
nn.Parameter(torch.Tensor(d_proj, d_embed))
|
nn.Parameter(torch.FloatTensor(d_proj, d_embed))
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
self.out_projs.append(None)
|
self.out_projs.append(None)
|
||||||
@@ -68,7 +68,7 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
|
|||||||
d_emb_i = d_embed // (div_val ** i)
|
d_emb_i = d_embed // (div_val ** i)
|
||||||
|
|
||||||
self.out_projs.append(
|
self.out_projs.append(
|
||||||
nn.Parameter(torch.Tensor(d_proj, d_emb_i))
|
nn.Parameter(torch.FloatTensor(d_proj, d_emb_i))
|
||||||
)
|
)
|
||||||
|
|
||||||
self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx))
|
self.out_layers.append(nn.Linear(d_emb_i, r_idx-l_idx))
|
||||||
|
|||||||
@@ -436,7 +436,7 @@ XLM_INPUTS_DOCSTRING = r"""
|
|||||||
A parallel sequence of tokens to be used to indicate the language of each token in the input.
|
A parallel sequence of tokens to be used to indicate the language of each token in the input.
|
||||||
Indices are selected in the pre-trained language vocabulary,
|
Indices are selected in the pre-trained language vocabulary,
|
||||||
i.e. in the range ``[0, config.n_langs - 1]``.
|
i.e. in the range ``[0, config.n_langs - 1]``.
|
||||||
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
@@ -449,7 +449,7 @@ XLM_INPUTS_DOCSTRING = r"""
|
|||||||
hidden-states (key and values in the attention blocks) as computed by the model
|
hidden-states (key and values in the attention blocks) as computed by the model
|
||||||
(see `cache` output below). Can be used to speed up sequential decoding.
|
(see `cache` output below). Can be used to speed up sequential decoding.
|
||||||
The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
|
The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
|
||||||
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
|
|||||||
@@ -367,16 +367,16 @@ class XLNetRelativeAttention(nn.Module):
|
|||||||
self.d_model = config.d_model
|
self.d_model = config.d_model
|
||||||
self.scale = 1 / (config.d_head ** 0.5)
|
self.scale = 1 / (config.d_head ** 0.5)
|
||||||
|
|
||||||
self.q = nn.Parameter(torch.Tensor(config.d_model, self.n_head, self.d_head))
|
self.q = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
|
||||||
self.k = nn.Parameter(torch.Tensor(config.d_model, self.n_head, self.d_head))
|
self.k = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
|
||||||
self.v = nn.Parameter(torch.Tensor(config.d_model, self.n_head, self.d_head))
|
self.v = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
|
||||||
self.o = nn.Parameter(torch.Tensor(config.d_model, self.n_head, self.d_head))
|
self.o = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
|
||||||
self.r = nn.Parameter(torch.Tensor(config.d_model, self.n_head, self.d_head))
|
self.r = nn.Parameter(torch.FloatTensor(config.d_model, self.n_head, self.d_head))
|
||||||
|
|
||||||
self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
|
self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
|
||||||
self.r_s_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
|
self.r_s_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
|
||||||
self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
|
self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
|
||||||
self.seg_embed = nn.Parameter(torch.Tensor(2, self.n_head, self.d_head))
|
self.seg_embed = nn.Parameter(torch.FloatTensor(2, self.n_head, self.d_head))
|
||||||
|
|
||||||
self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps)
|
self.layer_norm = XLNetLayerNorm(config.d_model, eps=config.layer_norm_eps)
|
||||||
self.dropout = nn.Dropout(config.dropout)
|
self.dropout = nn.Dropout(config.dropout)
|
||||||
@@ -660,11 +660,11 @@ XLNET_INPUTS_DOCSTRING = r"""
|
|||||||
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
A parallel sequence of tokens (can be used to indicate various portions of the inputs).
|
||||||
The embeddings from these tokens will be summed with the respective token embeddings.
|
The embeddings from these tokens will be summed with the respective token embeddings.
|
||||||
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
|
||||||
**attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
|
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
|
||||||
**input_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
|
**input_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
|
||||||
Mask to avoid performing attention on padding token indices.
|
Mask to avoid performing attention on padding token indices.
|
||||||
Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
|
Negative of `attention_mask`, i.e. with 0 for real tokens and 1 for padding.
|
||||||
Kept for compatibility with the original code base.
|
Kept for compatibility with the original code base.
|
||||||
@@ -685,7 +685,7 @@ XLNET_INPUTS_DOCSTRING = r"""
|
|||||||
Mask to indicate the output tokens to use.
|
Mask to indicate the output tokens to use.
|
||||||
If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k is on the j-th token.
|
If ``target_mapping[k, i, j] = 1``, the i-th prediction in batch k is on the j-th token.
|
||||||
Only used during pretraining for partial prediction or for sequential decoding (generation).
|
Only used during pretraining for partial prediction or for sequential decoding (generation).
|
||||||
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
|
||||||
Mask to nullify selected heads of the self-attention modules.
|
Mask to nullify selected heads of the self-attention modules.
|
||||||
Mask values selected in ``[0, 1]``:
|
Mask values selected in ``[0, 1]``:
|
||||||
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
|
||||||
@@ -735,7 +735,7 @@ class XLNetModel(XLNetPreTrainedModel):
|
|||||||
self.n_layer = config.n_layer
|
self.n_layer = config.n_layer
|
||||||
|
|
||||||
self.word_embedding = nn.Embedding(config.n_token, config.d_model)
|
self.word_embedding = nn.Embedding(config.n_token, config.d_model)
|
||||||
self.mask_emb = nn.Parameter(torch.Tensor(1, 1, config.d_model))
|
self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))
|
||||||
self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
|
self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
|
||||||
self.dropout = nn.Dropout(config.dropout)
|
self.dropout = nn.Dropout(config.dropout)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user