test deviation with tf model: max ~1e-3 should be ok
This commit is contained in:
169
hubconfs/xlnet_hubconf.py
Normal file
169
hubconfs/xlnet_hubconf.py
Normal file
@@ -0,0 +1,169 @@
|
||||
from pytorch_pretrained_bert.tokenization_xlnet import XLNetTokenizer
|
||||
from pytorch_pretrained_bert.modeling_xlnet import (
|
||||
XLNetConfig,
|
||||
XLNetModel,
|
||||
XLNetLMHeadModel,
|
||||
XLNetForSequenceClassification
|
||||
)
|
||||
|
||||
# A lot of models share the same param doc. Use a decorator
|
||||
# to save typing
|
||||
xlnet_docstring = """
|
||||
Params:
|
||||
pretrained_model_name_or_path: either:
|
||||
- a str with the name of a pre-trained model to load selected in the list of:
|
||||
. `xlnet-large-cased`
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `config.json` a configuration file for the model
|
||||
. `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `xlnet_config.json` a configuration file for the model
|
||||
. `model.chkpt` a TensorFlow checkpoint
|
||||
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
|
||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
||||
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
|
||||
*inputs, **kwargs: additional input for the specific XLNet class
|
||||
"""
|
||||
|
||||
|
||||
def _append_from_pretrained_docstring(docstr):
|
||||
def docstring_decorator(fn):
|
||||
fn.__doc__ = fn.__doc__ + docstr
|
||||
return fn
|
||||
return docstring_decorator
|
||||
|
||||
|
||||
def xlnetTokenizer(*args, **kwargs):
|
||||
"""
|
||||
Instantiate a XLNet sentencepiece tokenizer for XLNet from a pre-trained vocab file.
|
||||
Peculiarities:
|
||||
- require Google sentencepiece (https://github.com/google/sentencepiece)
|
||||
|
||||
Args:
|
||||
pretrained_model_name_or_path: Path to pretrained model archive
|
||||
or one of pre-trained vocab configs below.
|
||||
* xlnet-large-cased
|
||||
Keyword args:
|
||||
special_tokens: Special tokens in vocabulary that are not pretrained
|
||||
Default: None
|
||||
max_len: An artificial maximum length to truncate tokenized sequences to;
|
||||
Effective maximum length is always the minimum of this
|
||||
value (if specified) and the underlying model's
|
||||
sequence length.
|
||||
Default: None
|
||||
|
||||
Example:
|
||||
>>> import torch
|
||||
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
|
||||
|
||||
>>> text = "Who was Jim Henson ?"
|
||||
>>> indexed_tokens = tokenizer.encode(tokenized_text)
|
||||
"""
|
||||
tokenizer = XLNetTokenizer.from_pretrained(*args, **kwargs)
|
||||
return tokenizer
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(xlnet_docstring)
|
||||
def xlnetModel(*args, **kwargs):
|
||||
"""
|
||||
xlnetModel is the basic XLNet Transformer model from
|
||||
"XLNet: Generalized Autoregressive Pretraining for Language Understanding"
|
||||
by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
>>> import torch
|
||||
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
|
||||
|
||||
# Prepare tokenized input
|
||||
>>> text_1 = "Who was Jim Henson ?"
|
||||
>>> text_2 = "Jim Henson was a puppeteer"
|
||||
>>> indexed_tokens_1 = tokenizer.encode(text_1)
|
||||
>>> indexed_tokens_2 = tokenizer.encode(text_2)
|
||||
>>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
|
||||
>>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
|
||||
|
||||
# Load xlnetModel
|
||||
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetModel', 'xlnet-large-cased')
|
||||
>>> model.eval()
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
>>> with torch.no_grad():
|
||||
hidden_states_1, mems = model(tokens_tensor_1)
|
||||
hidden_states_2, mems = model(tokens_tensor_2, past=mems)
|
||||
"""
|
||||
model = XLNetModel.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(xlnet_docstring)
|
||||
def xlnetLMHeadModel(*args, **kwargs):
|
||||
"""
|
||||
xlnetModel is the basic XLNet Transformer model from
|
||||
"XLNet: Generalized Autoregressive Pretraining for Language Understanding"
|
||||
by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
|
||||
with a tied (pre-trained) language modeling head on top.
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
>>> import torch
|
||||
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
|
||||
|
||||
# Prepare tokenized input
|
||||
>>> text_1 = "Who was Jim Henson ?"
|
||||
>>> text_2 = "Jim Henson was a puppeteer"
|
||||
>>> indexed_tokens_1 = tokenizer.encode(text_1)
|
||||
>>> indexed_tokens_2 = tokenizer.encode(text_2)
|
||||
>>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
|
||||
>>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
|
||||
|
||||
# Load xlnetLMHeadModel
|
||||
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetLMHeadModel', 'xlnet-large-cased')
|
||||
>>> model.eval()
|
||||
|
||||
# Predict hidden states features for each layer
|
||||
>>> with torch.no_grad():
|
||||
predictions_1, mems = model(tokens_tensor_1)
|
||||
predictions_2, mems = model(tokens_tensor_2, mems=mems)
|
||||
|
||||
# Get the predicted last token
|
||||
>>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
|
||||
>>> predicted_token = tokenizer.decode([predicted_index])
|
||||
>>> assert predicted_token == ' who'
|
||||
"""
|
||||
model = XLNetLMHeadModel.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
|
||||
|
||||
@_append_from_pretrained_docstring(xlnet_docstring)
|
||||
def xlnetForSequenceClassification(*args, **kwargs):
|
||||
"""
|
||||
xlnetModel is the basic XLNet Transformer model from
|
||||
"XLNet: Generalized Autoregressive Pretraining for Language Understanding"
|
||||
by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
|
||||
|
||||
Example:
|
||||
# Load the tokenizer
|
||||
>>> import torch
|
||||
>>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetTokenizer', 'xlnet-large-cased')
|
||||
|
||||
# Prepare tokenized input
|
||||
>>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
|
||||
>>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
|
||||
>>> tokenized_text1 = tokenizer.tokenize(text1)
|
||||
>>> tokenized_text2 = tokenizer.tokenize(text2)
|
||||
>>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
|
||||
>>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
|
||||
>>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
|
||||
>>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
|
||||
|
||||
# Load xlnetForSequenceClassification
|
||||
>>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'xlnetForSequenceClassification', 'xlnet-large-cased')
|
||||
>>> model.eval()
|
||||
|
||||
# Predict sequence classes logits
|
||||
>>> with torch.no_grad():
|
||||
lm_logits, mems = model(tokens_tensor)
|
||||
"""
|
||||
model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
|
||||
return model
|
||||
@@ -3,7 +3,7 @@ from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
|
||||
from .tokenization_openai import OpenAIGPTTokenizer
|
||||
from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
|
||||
from .tokenization_gpt2 import GPT2Tokenizer
|
||||
from .tokenization_xlnet import XLNetTokenizer
|
||||
from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
|
||||
|
||||
from .modeling import (BertConfig, BertModel, BertForPreTraining,
|
||||
BertForMaskedLM, BertForNextSentencePrediction,
|
||||
|
||||
@@ -165,12 +165,12 @@ def load_tf_weights_in_xlnet(model, config, tf_path):
|
||||
|
||||
|
||||
def gelu(x):
|
||||
"""Implementation of the gelu activation function.
|
||||
For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
|
||||
0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
""" Implementation of the gelu activation function.
|
||||
XLNet is using OpenAI GPT's gelu (not exactly the same as BERT)
|
||||
Also see https://arxiv.org/abs/1606.08415
|
||||
"""
|
||||
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
|
||||
cdf = 0.5 * (1.0 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
return x * cdf
|
||||
|
||||
|
||||
def swish(x):
|
||||
@@ -657,7 +657,7 @@ class XLNetPreTrainedModel(nn.Module):
|
||||
- a str with the name of a pre-trained model to load selected in the list of:
|
||||
. `xlnet-large-cased`
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `xlnet_config.json` a configuration file for the model
|
||||
. `config.json` a configuration file for the model
|
||||
. `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `xlnet_config.json` a configuration file for the model
|
||||
@@ -767,6 +767,8 @@ class XLNetPreTrainedModel(nn.Module):
|
||||
if len(error_msgs) > 0:
|
||||
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
|
||||
model.__class__.__name__, "\n\t".join(error_msgs)))
|
||||
if isinstance(model, XLNetLMHeadModel):
|
||||
model.tie_weights() # make sure word embedding weights are still tied
|
||||
return model
|
||||
|
||||
|
||||
@@ -894,23 +896,23 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
output_all_encoded_layers=True, head_mask=None):
|
||||
"""
|
||||
Args:
|
||||
inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
|
||||
seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
|
||||
input_mask: [optional] float32 Tensor in shape [len, bsz], the input mask.
|
||||
inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
|
||||
seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
|
||||
input_mask: [optional] float32 Tensor in shape [bsz, len], the input mask.
|
||||
0 for real tokens and 1 for padding.
|
||||
mems: [optional] a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
|
||||
from previous batches. The length of the list equals n_layer.
|
||||
If None, no memory is used.
|
||||
perm_mask: [optional] float32 Tensor in shape [len, len, bsz].
|
||||
If perm_mask[i, j, k] = 0, i attend to j in batch k;
|
||||
if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
|
||||
perm_mask: [optional] float32 Tensor in shape [bsz, len, len].
|
||||
If perm_mask[k, i, j] = 0, i attend to j in batch k;
|
||||
if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
|
||||
If None, each position attends to all the others.
|
||||
target_mapping: [optional] float32 Tensor in shape [num_predict, len, bsz].
|
||||
If target_mapping[i, j, k] = 1, the i-th predict in batch k is
|
||||
target_mapping: [optional] float32 Tensor in shape [bsz, num_predict, len].
|
||||
If target_mapping[k, i, j] = 1, the i-th predict in batch k is
|
||||
on the j-th token.
|
||||
Only used during pretraining for partial prediction.
|
||||
Set to None during finetuning.
|
||||
inp_q: [optional] float32 Tensor in shape [len, bsz].
|
||||
inp_q: [optional] float32 Tensor in shape [bsz, len].
|
||||
1 for tokens with losses and 0 for tokens without losses.
|
||||
Only used during pretraining for two-stream attention.
|
||||
Set to None during finetuning.
|
||||
@@ -926,6 +928,16 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
summary_type: str, "last", "first", "mean", or "attn". The method
|
||||
to pool the input to get a vector representation.
|
||||
"""
|
||||
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
|
||||
# but we want a unified interface in the library with the batch size on the first dimension
|
||||
# so we move here the first dimension (batch) to the end
|
||||
inp_k = inp_k.transpose(0, 1).contiguous()
|
||||
seg_id = seg_id.transpose(0, 1).contiguous() if seg_id is not None else None
|
||||
input_mask = input_mask.transpose(0, 1).contiguous() if input_mask is not None else None
|
||||
perm_mask = perm_mask.permute(1, 2, 0).contiguous() if perm_mask is not None else None
|
||||
target_mapping = target_mapping.permute(1, 2, 0).contiguous() if target_mapping is not None else None
|
||||
inp_q = inp_q.transpose(0, 1).contiguous() if inp_q is not None else None
|
||||
|
||||
qlen, bsz = inp_k.shape[0], inp_k.shape[1]
|
||||
mlen = mems[0].shape[0] if mems is not None else 0
|
||||
klen = mlen + qlen
|
||||
@@ -1020,6 +1032,7 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
if mems is None:
|
||||
mems = [None] * len(self.layer)
|
||||
|
||||
hidden_states = []
|
||||
for i, layer_module in enumerate(self.layer):
|
||||
# cache new mems
|
||||
new_mems.append(self.cache_mem(output_h, mems[i]))
|
||||
@@ -1029,10 +1042,14 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
r=pos_emb, seg_mat=seg_mat,
|
||||
mems=mems[i], target_mapping=target_mapping,
|
||||
head_mask=head_mask)
|
||||
|
||||
hidden_states.append(output_h)
|
||||
output = self.dropout(output_g if output_g is not None else output_h)
|
||||
|
||||
return output, new_mems
|
||||
# We transpose back here to shape [bsz, len, hidden_dim] (cf. beginning of forward() method)
|
||||
output = output.permute(1, 0, 2).contiguous()
|
||||
hidden_states = [hs.permute(1, 0, 2).contiguous() for hs in hidden_states]
|
||||
|
||||
return output, hidden_states, new_mems
|
||||
|
||||
|
||||
class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
@@ -1110,23 +1127,23 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
target=None, output_all_encoded_layers=True, head_mask=None):
|
||||
"""
|
||||
Args:
|
||||
inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
|
||||
seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
|
||||
input_mask: float32 Tensor in shape [len, bsz], the input mask.
|
||||
inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
|
||||
seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
|
||||
input_mask: float32 Tensor in shape [bsz, len], the input mask.
|
||||
0 for real tokens and 1 for padding.
|
||||
mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
|
||||
from previous batches. The length of the list equals n_layer.
|
||||
If None, no memory is used.
|
||||
perm_mask: float32 Tensor in shape [len, len, bsz].
|
||||
If perm_mask[i, j, k] = 0, i attend to j in batch k;
|
||||
if perm_mask[i, j, k] = 1, i does not attend to j in batch k.
|
||||
perm_mask: float32 Tensor in shape [bsz, len, len].
|
||||
If perm_mask[k, i, j] = 0, i attend to j in batch k;
|
||||
if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
|
||||
If None, each position attends to all the others.
|
||||
target_mapping: float32 Tensor in shape [num_predict, len, bsz].
|
||||
If target_mapping[i, j, k] = 1, the i-th predict in batch k is
|
||||
target_mapping: float32 Tensor in shape [bsz, num_predict, len].
|
||||
If target_mapping[k, i, j] = 1, the i-th predict in batch k is
|
||||
on the j-th token.
|
||||
Only used during pretraining for partial prediction.
|
||||
Set to None during finetuning.
|
||||
inp_q: float32 Tensor in shape [len, bsz].
|
||||
inp_q: float32 Tensor in shape [bsz, len].
|
||||
1 for tokens with losses and 0 for tokens without losses.
|
||||
Only used during pretraining for two-stream attention.
|
||||
Set to None during finetuning.
|
||||
@@ -1134,7 +1151,131 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
summary_type: str, "last", "first", "mean", or "attn". The method
|
||||
to pool the input to get a vector representation.
|
||||
"""
|
||||
output, new_mems = self.transformer(inp_k, seg_id, input_mask,
|
||||
output, hidden_states, new_mems = self.transformer(inp_k, seg_id, input_mask,
|
||||
mems, perm_mask, target_mapping, inp_q,
|
||||
output_all_encoded_layers, head_mask)
|
||||
|
||||
logits = self.lm_loss(output)
|
||||
|
||||
if target is not None:
|
||||
# Flatten the tokens
|
||||
loss_fct = CrossEntropyLoss(ignore_index=-1)
|
||||
loss = loss_fct(logits.view(-1, logits.size(-1)),
|
||||
target.view(-1))
|
||||
return loss, new_mems
|
||||
|
||||
# if self.output_attentions:
|
||||
# all_attentions, encoded_layers = encoded_layers
|
||||
# sequence_output = encoded_layers[-1]
|
||||
# pooled_output = self.pooler(sequence_output)
|
||||
# if not output_all_encoded_layers:
|
||||
# encoded_layers = encoded_layers[-1]
|
||||
# if self.output_attentions:
|
||||
return logits, new_mems
|
||||
# return all_attentions, encoded_layers, pooled_output
|
||||
|
||||
|
||||
class XLNetForSequenceClassification(XLNetPreTrainedModel):
|
||||
"""XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
|
||||
|
||||
Params:
|
||||
`config`: a XLNetConfig class instance with the configuration to build a new model
|
||||
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
|
||||
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
|
||||
This can be used to compute head importance metrics. Default: False
|
||||
`summary_type`: str, "last", "first", "mean", or "attn". The method
|
||||
to pool the input to get a vector representation. Default: last
|
||||
|
||||
Inputs:
|
||||
inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
|
||||
seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
|
||||
input_mask: float32 Tensor in shape [bsz, len], the input mask.
|
||||
0 for real tokens and 1 for padding.
|
||||
mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
|
||||
from previous batches. The length of the list equals n_layer.
|
||||
If None, no memory is used.
|
||||
perm_mask: float32 Tensor in shape [bsz, len, len].
|
||||
If perm_mask[k, i, j] = 0, i attend to j in batch k;
|
||||
if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
|
||||
If None, each position attends to all the others.
|
||||
target_mapping: float32 Tensor in shape [bsz, num_predict, len].
|
||||
If target_mapping[k, i, j] = 1, the i-th predict in batch k is
|
||||
on the j-th token.
|
||||
Only used during pretraining for partial prediction.
|
||||
Set to None during finetuning.
|
||||
inp_q: float32 Tensor in shape [bsz, len].
|
||||
1 for tokens with losses and 0 for tokens without losses.
|
||||
Only used during pretraining for two-stream attention.
|
||||
Set to None during finetuning.
|
||||
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
|
||||
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
|
||||
|
||||
|
||||
Outputs: Tuple of (logits or loss, mems)
|
||||
`logits or loss`:
|
||||
if target is None:
|
||||
Token logits with shape [batch_size, sequence_length]
|
||||
else:
|
||||
CrossEntropy loss with the targets
|
||||
`new_mems`: list (num layers) of updated mem states at the entry of each layer
|
||||
each mem state is a torch.FloatTensor of size [self.config.mem_len, batch_size, self.config.d_model]
|
||||
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `target`
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into WordPiece token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
|
||||
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
|
||||
|
||||
config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
|
||||
n_layer=12, num_attention_heads=12, intermediate_size=3072)
|
||||
|
||||
model = modeling.XLNetModel(config=config)
|
||||
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
|
||||
```
|
||||
"""
|
||||
def __init__(self, config, summary_type="last", output_attentions=False, keep_multihead_output=False):
|
||||
super(XLNetForSequenceClassification, self).__init__(config)
|
||||
self.output_attentions = output_attentions
|
||||
self.attn_type = config.attn_type
|
||||
self.same_length = config.same_length
|
||||
self.summary_type = summary_type
|
||||
|
||||
self.transformer = XLNetModel(config, output_attentions=output_attentions,
|
||||
keep_multihead_output=keep_multihead_output)
|
||||
self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
|
||||
|
||||
self.apply(self.init_xlnet_weights)
|
||||
self.tie_weights()
|
||||
|
||||
def forward(self, inp_k, seg_id=None, input_mask=None,
|
||||
mems=None, perm_mask=None, target_mapping=None, inp_q=None,
|
||||
target=None, output_all_encoded_layers=True, head_mask=None):
|
||||
"""
|
||||
Args:
|
||||
inp_k: int32 Tensor in shape [bsz, len], the input token IDs.
|
||||
seg_id: int32 Tensor in shape [bsz, len], the input segment IDs.
|
||||
input_mask: float32 Tensor in shape [bsz, len], the input mask.
|
||||
0 for real tokens and 1 for padding.
|
||||
mems: a list of float32 Tensors in shape [mem_len, bsz, d_model], memory
|
||||
from previous batches. The length of the list equals n_layer.
|
||||
If None, no memory is used.
|
||||
perm_mask: float32 Tensor in shape [bsz, len, len].
|
||||
If perm_mask[k, i, j] = 0, i attend to j in batch k;
|
||||
if perm_mask[k, i, j] = 1, i does not attend to j in batch k.
|
||||
If None, each position attends to all the others.
|
||||
target_mapping: float32 Tensor in shape [bsz, num_predict, len].
|
||||
If target_mapping[k, i, j] = 1, the i-th predict in batch k is
|
||||
on the j-th token.
|
||||
Only used during pretraining for partial prediction.
|
||||
Set to None during finetuning.
|
||||
inp_q: float32 Tensor in shape [bsz, len].
|
||||
1 for tokens with losses and 0 for tokens without losses.
|
||||
Only used during pretraining for two-stream attention.
|
||||
Set to None during finetuning.
|
||||
"""
|
||||
output, hidden_states, new_mems = self.transformer(inp_k, seg_id, input_mask,
|
||||
mems, perm_mask, target_mapping, inp_q,
|
||||
output_all_encoded_layers, head_mask)
|
||||
|
||||
|
||||
@@ -74,9 +74,9 @@ class XLNetModelTest(unittest.TestCase):
|
||||
self.type_vocab_size = type_vocab_size
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
input_ids_1 = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
|
||||
input_ids_2 = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
|
||||
segment_ids = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.type_vocab_size)
|
||||
input_ids_1 = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
input_ids_2 = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
segment_ids = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
|
||||
|
||||
# inp_k: int32 Tensor in shape [len, bsz], the input token IDs.
|
||||
# seg_id: int32 Tensor in shape [len, bsz], the input segment IDs.
|
||||
@@ -101,7 +101,7 @@ class XLNetModelTest(unittest.TestCase):
|
||||
|
||||
lm_labels = None
|
||||
if self.use_labels:
|
||||
lm_labels = XLNetModelTest.ids_tensor([self.seq_length, self.batch_size], self.vocab_size)
|
||||
lm_labels = XLNetModelTest.ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
|
||||
|
||||
config = XLNetConfig(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
@@ -155,7 +155,7 @@ class XLNetModelTest(unittest.TestCase):
|
||||
[])
|
||||
self.parent.assertListEqual(
|
||||
list(result["lm_logits_1"].size()),
|
||||
[self.seq_length, self.batch_size, self.vocab_size])
|
||||
[self.batch_size, self.seq_length, self.vocab_size])
|
||||
self.parent.assertListEqual(
|
||||
list(list(mem.size()) for mem in result["mems_1a"]),
|
||||
[[self.seq_length, self.batch_size, self.d_model]] * self.n_layer)
|
||||
@@ -171,7 +171,7 @@ class XLNetModelTest(unittest.TestCase):
|
||||
[])
|
||||
self.parent.assertListEqual(
|
||||
list(result["lm_logits_2"].size()),
|
||||
[self.seq_length, self.batch_size, self.vocab_size])
|
||||
[self.batch_size, self.seq_length, self.vocab_size])
|
||||
self.parent.assertListEqual(
|
||||
list(list(mem.size()) for mem in result["mems_2a"]),
|
||||
[[self.mem_len, self.batch_size, self.d_model]] * self.n_layer)
|
||||
|
||||
@@ -20,7 +20,9 @@ from io import open
|
||||
import shutil
|
||||
import pytest
|
||||
|
||||
from pytorch_pretrained_bert.tokenization_xlnet import (XLNetTokenizer, PRETRAINED_VOCAB_ARCHIVE_MAP)
|
||||
from pytorch_pretrained_bert.tokenization_xlnet import (XLNetTokenizer,
|
||||
PRETRAINED_VOCAB_ARCHIVE_MAP,
|
||||
SPIECE_UNDERLINE)
|
||||
|
||||
SAMPLE_VOCAB = os.path.join(os.path.dirname(
|
||||
os.path.dirname(os.path.abspath(__file__))),
|
||||
@@ -45,9 +47,9 @@ class XLNetTokenizationTest(unittest.TestCase):
|
||||
os.remove(special_tokens_file)
|
||||
|
||||
tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
|
||||
self.assertListEqual(tokens, ['▁I', '▁was', '▁b', 'or', 'n', '▁in', '▁',
|
||||
'9', '2', '0', '0', '0', ',', '▁and', '▁this',
|
||||
'▁is', '▁f', 'al', 's', 'é', '.'])
|
||||
self.assertListEqual(tokens, [SPIECE_UNDERLINE + 'I', SPIECE_UNDERLINE + 'was', SPIECE_UNDERLINE + 'b', 'or', 'n', SPIECE_UNDERLINE + 'in', SPIECE_UNDERLINE + '',
|
||||
'9', '2', '0', '0', '0', ',', SPIECE_UNDERLINE + 'and', SPIECE_UNDERLINE + 'this',
|
||||
SPIECE_UNDERLINE + 'is', SPIECE_UNDERLINE + 'f', 'al', 's', 'é', '.'])
|
||||
ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
self.assertListEqual(
|
||||
ids, [8, 21, 84, 55, 24, 19, 7, 0,
|
||||
@@ -55,9 +57,9 @@ class XLNetTokenizationTest(unittest.TestCase):
|
||||
46, 72, 80, 6, 0, 4])
|
||||
|
||||
back_tokens = tokenizer.convert_ids_to_tokens(ids)
|
||||
self.assertListEqual(back_tokens, ['▁I', '▁was', '▁b', 'or', 'n', '▁in',
|
||||
'▁', '<unk>', '2', '0', '0', '0', ',',
|
||||
'▁and', '▁this', '▁is', '▁f', 'al', 's',
|
||||
self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + 'I', SPIECE_UNDERLINE + 'was', SPIECE_UNDERLINE + 'b', 'or', 'n', SPIECE_UNDERLINE + 'in',
|
||||
SPIECE_UNDERLINE + '', '<unk>', '2', '0', '0', '0', ',',
|
||||
SPIECE_UNDERLINE + 'and', SPIECE_UNDERLINE + 'this', SPIECE_UNDERLINE + 'is', SPIECE_UNDERLINE + 'f', 'al', 's',
|
||||
'<unk>', '.'])
|
||||
|
||||
@pytest.mark.slow
|
||||
@@ -71,17 +73,17 @@ class XLNetTokenizationTest(unittest.TestCase):
|
||||
def test_tokenizer_lower(self):
|
||||
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)
|
||||
tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
|
||||
self.assertListEqual(tokens, ['▁', 'i', '▁was', '▁b', 'or', 'n', '▁in', '▁',
|
||||
'9', '2', '0', '0', '0', ',', '▁and', '▁this',
|
||||
'▁is', '▁f', 'al', 'se', '.'])
|
||||
self.assertListEqual(tokens, [SPIECE_UNDERLINE + '', 'i', SPIECE_UNDERLINE + 'was', SPIECE_UNDERLINE + 'b', 'or', 'n', SPIECE_UNDERLINE + 'in', SPIECE_UNDERLINE + '',
|
||||
'9', '2', '0', '0', '0', ',', SPIECE_UNDERLINE + 'and', SPIECE_UNDERLINE + 'this',
|
||||
SPIECE_UNDERLINE + 'is', SPIECE_UNDERLINE + 'f', 'al', 'se', '.'])
|
||||
self.assertListEqual(tokenizer.tokenize(u"H\u00E9llo"), ["▁he", "ll", "o"])
|
||||
|
||||
def test_tokenizer_no_lower(self):
|
||||
tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=False)
|
||||
tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
|
||||
self.assertListEqual(tokens, ['▁I', '▁was', '▁b', 'or', 'n', '▁in', '▁',
|
||||
'9', '2', '0', '0', '0', ',', '▁and', '▁this',
|
||||
'▁is', '▁f', 'al', 'se', '.'])
|
||||
self.assertListEqual(tokens, [SPIECE_UNDERLINE + 'I', SPIECE_UNDERLINE + 'was', SPIECE_UNDERLINE + 'b', 'or', 'n', SPIECE_UNDERLINE + 'in', SPIECE_UNDERLINE + '',
|
||||
'9', '2', '0', '0', '0', ',', SPIECE_UNDERLINE + 'and', SPIECE_UNDERLINE + 'this',
|
||||
SPIECE_UNDERLINE + 'is', SPIECE_UNDERLINE + 'f', 'al', 'se', '.'])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
Reference in New Issue
Block a user