diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py index dbe979c564..0dc86563aa 100644 --- a/pytorch_transformers/__init__.py +++ b/pytorch_transformers/__init__.py @@ -68,7 +68,8 @@ if _torch_available: GPT2LMHeadModel, GPT2DoubleHeadsModel, load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_xlnet import (XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel, - XLNetForSequenceClassification, XLNetForQuestionAnswering, + XLNetForSequenceClassification, XLNetForQuestionAnsweringSimple, + XLNetForQuestionAnswering, load_tf_weights_in_xlnet, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_xlm import (XLMPreTrainedModel , XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, @@ -112,6 +113,12 @@ if _tf_available: load_gpt2_pt_weights_in_tf2, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_xlnet import (TFXLNetPreTrainedModel, TFXLNetMainLayer, + TFXLNetModel, TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForQuestionAnsweringSimple, + load_xlnet_pt_weights_in_tf2, + TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) # Files and general utilities from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, diff --git a/pytorch_transformers/configuration_utils.py b/pytorch_transformers/configuration_utils.py index 42346d6b6c..fb1fe82f43 100644 --- a/pytorch_transformers/configuration_utils.py +++ b/pytorch_transformers/configuration_utils.py @@ -175,7 +175,7 @@ class PretrainedConfig(object): """Constructs a `Config` from a Python dictionary of parameters.""" config = cls(vocab_size_or_config_json_file=-1) for key, value in json_object.items(): - config.__dict__[key] = value + setattr(config, key, value) return config @classmethod diff --git a/pytorch_transformers/configuration_xlnet.py b/pytorch_transformers/configuration_xlnet.py index 204d44aa72..cb325dfe17 100644 --- a/pytorch_transformers/configuration_xlnet.py +++ b/pytorch_transformers/configuration_xlnet.py @@ 
-112,7 +112,7 @@ class XLNetConfig(PretrainedConfig): with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: json_config = json.loads(reader.read()) for key, value in json_config.items(): - self.__dict__[key] = value + setattr(self, key, value) elif isinstance(vocab_size_or_config_json_file, int): self.n_token = vocab_size_or_config_json_file self.d_model = d_model diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py index 03b14d4517..0acaf92788 100644 --- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf2.py @@ -24,12 +24,13 @@ import tensorflow as tf from pytorch_transformers import is_torch_available from pytorch_transformers import (BertConfig, TFBertForPreTraining, load_bert_pt_weights_in_tf2, - GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2) + GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, + XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2) if is_torch_available(): import torch import numpy as np - from pytorch_transformers import BertForPreTraining, GPT2LMHeadModel + from pytorch_transformers import BertForPreTraining, GPT2LMHeadModel, XLNetLMHeadModel else: - BertForPreTraining, GPT2LMHeadModel = None, None + BertForPreTraining, GPT2LMHeadModel, XLNetLMHeadModel = None, None, None @@ -40,6 +41,7 @@ logging.basicConfig(level=logging.INFO) MODEL_CLASSES = { 'bert': (BertConfig, TFBertForPreTraining, load_bert_pt_weights_in_tf2, BertForPreTraining), 'gpt2': (GPT2Config, TFGPT2LMHeadModel, load_gpt2_pt_weights_in_tf2, GPT2LMHeadModel), + 'xlnet': (XLNetConfig, TFXLNetLMHeadModel, load_xlnet_pt_weights_in_tf2, XLNetLMHeadModel), } def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False): @@ -50,6 +52,8 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file # Initialise TF model config = config_class.from_json_file(config_file) +
config.output_hidden_states = True + config.output_attentions = True print("Building TensorFlow model from configuration: {}".format(str(config))) tf_model = model_class(config) diff --git a/pytorch_transformers/modeling_tf_bert.py b/pytorch_transformers/modeling_tf_bert.py index 74951fed34..f1b5ec7109 100644 --- a/pytorch_transformers/modeling_tf_bert.py +++ b/pytorch_transformers/modeling_tf_bert.py @@ -83,7 +83,7 @@ def load_bert_pt_weights_in_tf2(tf_model, config, pytorch_checkpoint_path): name = name.replace('cls_mlm', 'cls') # We had to split this layer in two in the TF model to be name = name.replace('cls_nsp', 'cls') # able to do transfer learning (Keras only allow to remove full layers) name = name.replace(':0', '') - name = name.replace('layer_', 'layer/') + name = name.replace('__', '/') name = name.split('/') name = name[1:] @@ -391,7 +391,7 @@ class TFBertEncoder(tf.keras.layers.Layer): super(TFBertEncoder, self).__init__(**kwargs) self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states - self.layer = [TFBertLayer(config, name='layer_{}'.format(i)) for i in range(config.num_hidden_layers)] + self.layer = [TFBertLayer(config, name='layer__{}'.format(i)) for i in range(config.num_hidden_layers)] def call(self, inputs, training=False): hidden_states, attention_mask, head_mask = inputs diff --git a/pytorch_transformers/modeling_tf_gpt2.py b/pytorch_transformers/modeling_tf_gpt2.py index 72421370bb..491b592b13 100644 --- a/pytorch_transformers/modeling_tf_gpt2.py +++ b/pytorch_transformers/modeling_tf_gpt2.py @@ -70,7 +70,7 @@ def load_gpt2_pt_weights_in_tf2(tf_model, config, pytorch_checkpoint_path): for symbolic_weight in symbolic_weights: name = symbolic_weight.name name = name.replace(':0', '') - name = name.replace('h_', 'h/') + name = name.replace('__', '/') name = name.split('/') name = name[2:] @@ -282,7 +282,7 @@ class TFGPT2MainLayer(tf.keras.layers.Layer): self.h = [TFBlock(config.n_ctx, config, 
scale=True, - name='h_{}'.format(i)) for i in range(config.n_layer)] + name='h__{}'.format(i)) for i in range(config.n_layer)] self.ln_f = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name='ln_f') def _resize_token_embeddings(self, new_num_tokens): diff --git a/pytorch_transformers/modeling_tf_utils.py b/pytorch_transformers/modeling_tf_utils.py index 1860ab4f8b..704de3b62f 100644 --- a/pytorch_transformers/modeling_tf_utils.py +++ b/pytorch_transformers/modeling_tf_utils.py @@ -386,7 +386,7 @@ class TFSequenceSummary(tf.keras.layers.Layer): self.activation = None if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh': - self.activation = tf.keras.layers.Tanh() + self.activation = tf.keras.activations.tanh self.first_dropout = None if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0: diff --git a/pytorch_transformers/modeling_tf_xlnet.py b/pytorch_transformers/modeling_tf_xlnet.py index 7708e3abfd..12f0640531 100644 --- a/pytorch_transformers/modeling_tf_xlnet.py +++ b/pytorch_transformers/modeling_tf_xlnet.py @@ -28,7 +28,7 @@ import numpy as np import tensorflow as tf from .configuration_xlnet import XLNetConfig -from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list +from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list from .file_utils import add_start_docstrings @@ -72,13 +72,15 @@ def load_xlnet_pt_weights_in_tf2(tf_model, config, pytorch_checkpoint_path): name = name.replace('cls_mlm', 'cls') # We had to split this layer in two in the TF model to be name = name.replace('cls_nsp', 'cls') # able to do transfer learning (Keras only allow to remove full layers) name = name.replace(':0', '') - name = name.replace('layer_', 'layer/') + name = name.replace('layer__', 'layer/') name = name.split('/') name = name[1:] transpose = bool(name[-1] == 'kernel') - if name[-1] == 'kernel' or name[-1] == 'embeddings': + if 
name[-1] == 'kernel' or name[-1] == 'embeddings' or name[-1] == 'gamma': name[-1] = 'weight' + if name[-1] == 'beta': + name[-1] = 'bias' name = '.'.join(name) assert name in state_dict, "{} not found in PyTorch model".format(name) @@ -237,16 +239,16 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): return attn_vec - def post_attention(self, inputs, training=False): + def post_attention(self, inputs, residual=True, training=False): """Post-attention processing.""" # post-attention projection (back to `d_model`) - h, attn_vec, residual = inputs + h, attn_vec = inputs attn_out = tf.einsum('ibnd,hnd->ibh', attn_vec, self.o) attn_out = self.dropout(attn_out, training=training) - if residual is not None: + if residual: attn_out = attn_out + h output = self.layer_norm(attn_out) @@ -259,23 +261,23 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): if g is not None: ###### Two-stream attention with relative positional encoding. # content based attention score - if mems is not None and mems.dim() > 1: - cat = torch.cat([mems, h], dim=0) + if mems is not None and mems.shape.ndims > 1: + cat = tf.concat([mems, h], axis=0) else: cat = h # content-based key head - k_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.k) + k_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.k) # content-based value head - v_head_h = torch.einsum('ibh,hnd->ibnd', cat, self.v) + v_head_h = tf.einsum('ibh,hnd->ibnd', cat, self.v) # position-based key head - k_head_r = torch.einsum('ibh,hnd->ibnd', r, self.r) + k_head_r = tf.einsum('ibh,hnd->ibnd', r, self.r) ##### h-stream # content-stream query head - q_head_h = torch.einsum('ibh,hnd->ibnd', h, self.q) + q_head_h = tf.einsum('ibh,hnd->ibnd', h, self.q) # core attention ops attn_vec_h = self.rel_attn_core( @@ -286,15 +288,15 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): attn_vec_h, attn_prob_h = attn_vec_h # post processing - output_h = self.post_attention([h, attn_vec_h, None], training=training) + output_h = 
self.post_attention([h, attn_vec_h], training=training) ##### g-stream # query-stream query head - q_head_g = torch.einsum('ibh,hnd->ibnd', g, self.q) + q_head_g = tf.einsum('ibh,hnd->ibnd', g, self.q) # core attention ops if target_mapping is not None: - q_head_g = torch.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping) + q_head_g = tf.einsum('mbnd,mlb->lbnd', q_head_g, target_mapping) attn_vec_g = self.rel_attn_core( [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], training=training) @@ -302,7 +304,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): if self.output_attentions: attn_vec_g, attn_prob_g = attn_vec_g - attn_vec_g = torch.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping) + attn_vec_g = tf.einsum('lbnd,mlb->mbnd', attn_vec_g, target_mapping) else: attn_vec_g = self.rel_attn_core( [q_head_g, k_head_h, v_head_h, k_head_r, seg_mat, attn_mask_g, head_mask], @@ -312,15 +314,15 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): attn_vec_g, attn_prob_g = attn_vec_g # post processing - output_g = self.post_attention([g, attn_vec_g, None], training=training) + output_g = self.post_attention([g, attn_vec_g], training=training) if self.output_attentions: attn_prob = attn_prob_h, attn_prob_g else: ###### Multi-head attention with relative positional encoding - if mems is not None and mems.dim() > 1: - cat = tf.concat([mems, h], dim=0) + if mems is not None and mems.shape.ndims > 1: + cat = tf.concat([mems, h], axis=0) else: cat = h @@ -341,7 +343,7 @@ class TFXLNetRelativeAttention(tf.keras.layers.Layer): attn_vec, attn_prob = attn_vec # post processing - output_h = self.post_attention([h, attn_vec, None], training=training) + output_h = self.post_attention([h, attn_vec], training=training) output_g = None outputs = (output_h, output_g) @@ -391,6 +393,27 @@ class TFXLNetLayer(tf.keras.layers.Layer): return outputs +class TFXLNetLMHead(tf.keras.layers.Layer): + def __init__(self, config, input_embeddings, **kwargs): + 
super(TFXLNetLMHead, self).__init__(**kwargs) + self.vocab_size = config.vocab_size + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.input_embeddings = input_embeddings + + def build(self, input_shape): + self.bias = self.add_weight(shape=(self.vocab_size,), + initializer='zeros', + trainable=True, + name='bias') + super(TFXLNetLMHead, self).build(input_shape) + + def call(self, hidden_states): + hidden_states = self.input_embeddings(hidden_states, mode="linear") + hidden_states = hidden_states + self.bias + return hidden_states + + class TFXLNetMainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFXLNetMainLayer, self).__init__(**kwargs) @@ -409,7 +432,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): self.initializer_range = config.initializer_range self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding') - self.layer = [TFXLNetLayer(config, name='layer_{}'.format(i)) for i in range(config.n_layer)] + self.layer = [TFXLNetLayer(config, name='layer__{}'.format(i)) for i in range(config.n_layer)] self.dropout = tf.keras.layers.Dropout(config.dropout) def build(self, input_shape): @@ -464,7 +487,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): if prev_mem is None: new_mem = curr_out[-self.mem_len:] else: - new_mem = tf.concat([prev_mem, curr_out], 0)[-mem_len:] + new_mem = tf.concat([prev_mem, curr_out], 0)[-self.mem_len:] return tf.stop_gradient(new_mem) @@ -618,7 +641,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): word_emb_k = self.word_embedding(input_ids) output_h = self.dropout(word_emb_k, training=training) if target_mapping is not None: - word_emb_q = tf.tile(mask_emb, [tf.shape(target_mapping)[0], bsz, 1]) + word_emb_q = tf.tile(self.mask_emb, [tf.shape(target_mapping)[0], bsz, 1]) # else: # We removed the inp_q input which was same as target mapping # inp_q_ext = 
inp_q[:, :, None] # word_emb_q = inp_q_ext * self.mask_emb + (1 - inp_q_ext) * word_emb_k @@ -827,187 +850,173 @@ class TFXLNetModel(TFXLNetPreTrainedModel): return outputs -# @add_start_docstrings("""XLNet Model with a language modeling head on top -# (linear layer with weights tied to the input embeddings). """, -# XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) -# class XLNetLMHeadModel(XLNetPreTrainedModel): -# r""" -# **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: -# Labels for language modeling. -# Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` -# Indices are selected in ``[-1, 0, ..., config.vocab_size]`` -# All labels set to ``-1`` are ignored (masked), the loss is only -# computed for labels in ``[0, ..., config.vocab_size]`` +@add_start_docstrings("""XLNet Model with a language modeling head on top + (linear layer with weights tied to the input embeddings). """, + XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) +class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **mems**: + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. + See details in the docstring of the `mems` input above. 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. -# Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: -# **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: -# Language modeling loss. -# **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` -# Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). -# **mems**: -# list of ``torch.FloatTensor`` (one for each layer): -# that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model -# if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. -# See details in the docstring of the `mems` input above. -# **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) -# list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) -# of shape ``(batch_size, sequence_length, hidden_size)``: -# Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
-# **attentions**: (`optional`, returned when ``config.output_attentions=True``) -# list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: -# Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Examples:: -# Examples:: + tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') + model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased') + # We show how to setup inputs to predict a next token using a bi-directional context. + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very ")).unsqueeze(0) # We will predict the masked token + perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float) + perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token + target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token + target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token) + outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping) + next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] -# tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') -# model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased') -# # We show how to setup inputs to predict a next token using a bi-directional context. 
-# input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very ")).unsqueeze(0) # We will predict the masked token -# perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float) -# perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token -# target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token -# target_mapping[0, 0, -1] = 1.0 # Our first (and only) prediction will be the last token of the sequence (the masked token) -# outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping) -# next_token_logits = outputs[0] # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size] + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLNetLMHeadModel, self).__init__(config, *inputs, **kwargs) + self.n_token = config.n_token -# """ -# def __init__(self, config, **kwargs): -# super(XLNetLMHeadModel, self).__init__(config) -# self.attn_type = config.attn_type -# self.same_length = config.same_length + self.transformer = TFXLNetMainLayer(config, name='transformer') + self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name='lm_loss') -# self.transformer = XLNetModel(config) -# self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True) + def call(self, inputs, training=False): + transformer_outputs = self.transformer(inputs, training=training) + hidden_state = transformer_outputs[0] + logits = self.lm_loss(hidden_state) -# self.init_weights() -# self.tie_weights() + outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it -# def tie_weights(self): -# """ Make sure we are sharing the embeddings -# """ -# self._tie_or_clone_weights(self.lm_loss, self.transformer.word_embedding) - -# def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, -# token_type_ids=None, input_mask=None, head_mask=None, 
labels=None): -# transformer_outputs = self.transformer(input_ids, -# attention_mask=attention_mask, -# mems=mems, -# perm_mask=perm_mask, -# target_mapping=target_mapping, -# token_type_ids=token_type_ids, -# input_mask=input_mask, -# head_mask=head_mask) - -# logits = self.lm_loss(transformer_outputs[0]) - -# outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it - -# if labels is not None: -# # Flatten the tokens -# loss_fct = CrossEntropyLoss(ignore_index=-1) -# loss = loss_fct(logits.view(-1, logits.size(-1)), -# labels.view(-1)) -# outputs = (loss,) + outputs - -# return outputs # return (loss), logits, mems, (hidden states), (attentions) + return outputs # return logits, mems, (hidden states), (attentions) -# @add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of -# the pooled output) e.g. for GLUE tasks. """, -# XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) -# class XLNetForSequenceClassification(XLNetPreTrainedModel): -# r""" -# **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: -# Labels for computing the sequence classification/regression loss. -# Indices should be in ``[0, ..., config.num_labels - 1]``. -# If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), -# If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). +@add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) +class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. 
+ If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). -# Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: -# **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: -# Classification (or regression if config.num_labels==1) loss. -# **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` -# Classification (or regression if config.num_labels==1) scores (before SoftMax). -# **mems**: -# list of ``torch.FloatTensor`` (one for each layer): -# that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model -# if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. -# See details in the docstring of the `mems` input above. -# **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) -# list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) -# of shape ``(batch_size, sequence_length, hidden_size)``: -# Hidden-states of the model at the output of each layer plus the initial embedding outputs. -# **attentions**: (`optional`, returned when ``config.output_attentions=True``) -# list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: -# Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification (or regression if config.num_labels==1) loss. 
+ **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **mems**: + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. + See details in the docstring of the `mems` input above. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
-# Examples:: + Examples:: -# tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') -# model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased') -# input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 -# labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 -# outputs = model(input_ids, labels=labels) -# loss, logits = outputs[:2] + tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') + model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + # The TF model computes no loss, so no ``labels`` are passed: ``call`` only takes ``inputs`` + outputs = model(input_ids) + logits = outputs[0] -# """ -# def __init__(self, config, **kwargs): -# super(XLNetForSequenceClassification, self).__init__(config) -# self.num_labels = config.num_labels + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLNetForSequenceClassification, self).__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels -# self.transformer = XLNetModel(config) -# self.sequence_summary = SequenceSummary(config) -# self.logits_proj = nn.Linear(config.d_model, config.num_labels) + self.transformer = TFXLNetMainLayer(config, name='transformer') + self.sequence_summary = TFSequenceSummary(config, name='sequence_summary') + self.logits_proj = tf.keras.layers.Dense(config.num_labels, name='logits_proj') -# self.init_weights() + def call(self, inputs, training=False): + transformer_outputs = self.transformer(inputs, training=training) + output = transformer_outputs[0] -# def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, -# token_type_ids=None, input_mask=None, head_mask=None, labels=None): -# transformer_outputs = self.transformer(input_ids, -# attention_mask=attention_mask, -# mems=mems, -# perm_mask=perm_mask, -# target_mapping=target_mapping, -#
token_type_ids=token_type_ids, -# input_mask=input_mask, -# head_mask=head_mask) -# output = transformer_outputs[0] + output = self.sequence_summary(output) + logits = self.logits_proj(output) -# output = self.sequence_summary(output) -# logits = self.logits_proj(output) + outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it -# outputs = (logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it - -# if labels is not None: -# if self.num_labels == 1: -# # We are doing regression -# loss_fct = MSELoss() -# loss = loss_fct(logits.view(-1), labels.view(-1)) -# else: -# loss_fct = CrossEntropyLoss() -# loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) -# outputs = (loss,) + outputs - -# return outputs # return (loss), logits, mems, (hidden states), (attentions) + return outputs # return logits, mems, (hidden states), (attentions) # @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of # the hidden-states output to compute `span start logits` and `span end logits`). """, # XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) -# class XLNetForQuestionAnswering(XLNetPreTrainedModel): +# class TFXLNetForQuestionAnswering(TFXLNetPreTrainedModel): +class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') + model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + # The TF model computes no loss: ``call`` only takes ``inputs``, so no + # ``start_positions``/``end_positions`` are passed + outputs = model(input_ids) + start_scores, end_scores = outputs[:2] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.transformer = TFXLNetMainLayer(config, name='transformer') + self.qa_outputs = tf.keras.layers.Dense(config.num_labels, name='qa_outputs') + + def call(self, inputs, training=False): + transformer_outputs = self.transformer(inputs, training=training) + + sequence_output = transformer_outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + outputs = (start_logits, end_logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + + return outputs # start_logits, end_logits,
(hidden_states), (attentions) + +# @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of +# the hidden-states output to compute `span start logits` and `span end logits`). """, +# XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) +# class TFXLNetForQuestionAnswering(TFXLNetPreTrainedModel): # r""" -# **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: -# Labels for position (index) of the start of the labelled span for computing the token classification loss. -# Positions are clamped to the length of the sequence (`sequence_length`). -# Position outside of the sequence are not taken into account for computing the loss. -# **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: -# Labels for position (index) of the end of the labelled span for computing the token classification loss. -# Positions are clamped to the length of the sequence (`sequence_length`). -# Position outside of the sequence are not taken into account for computing the loss. -# **is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: -# Labels whether a question has an answer or no answer (SQuAD 2.0) -# **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: -# Labels for position (index) of the classification token to use as input for computing plausibility of the answer. # **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: # Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...). # 1.0 means token should be masked. 0.0 mean token is not masked. 
@@ -1054,29 +1063,18 @@ class TFXLNetModel(TFXLNetPreTrainedModel): # loss, start_scores, end_scores = outputs[:2] # """ -# def __init__(self, config, **kwargs): -# super(XLNetForQuestionAnswering, self).__init__(config) +# def __init__(self, config, *inputs, **kwargs): +# super(TFXLNetForQuestionAnswering, self).__init__(config, *inputs, **kwargs) # self.start_n_top = config.start_n_top # self.end_n_top = config.end_n_top -# self.transformer = XLNetModel(config) -# self.start_logits = PoolerStartLogits(config) -# self.end_logits = PoolerEndLogits(config) -# self.answer_class = PoolerAnswerClass(config) +# self.transformer = TFXLNetMainLayer(config, name='transformer') +# self.start_logits = TFPoolerStartLogits(config, name='start_logits') +# self.end_logits = TFPoolerEndLogits(config, name='end_logits') +# self.answer_class = TFPoolerAnswerClass(config, name='answer_class') -# self.init_weights() - -# def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, -# token_type_ids=None, input_mask=None, head_mask=None, -# start_positions=None, end_positions=None, is_impossible=None, cls_index=None, p_mask=None,): -# transformer_outputs = self.transformer(input_ids, -# attention_mask=attention_mask, -# mems=mems, -# perm_mask=perm_mask, -# target_mapping=target_mapping, -# token_type_ids=token_type_ids, -# input_mask=input_mask, -# head_mask=head_mask) +# def call(self, inputs, training=False): +# transformer_outputs = self.transformer(inputs, training=training) # hidden_states = transformer_outputs[0] # start_logits = self.start_logits(hidden_states, p_mask=p_mask) diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py index 97feaad371..c8e55d2107 100644 --- a/pytorch_transformers/modeling_xlnet.py +++ b/pytorch_transformers/modeling_xlnet.py @@ -1003,6 +1003,101 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): return outputs # return (loss), logits, mems, (hidden states), 
(attentions) +@add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) +class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): + r""" + **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Positions outside of the sequence are not taken into account for computing the loss. + **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Positions outside of the sequence are not taken into account for computing the loss. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss as the average of the start token and end token classification losses. + **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). + **mems**: + list of ``torch.FloatTensor`` (one for each layer): + that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model + if config.mem_len > 0 else tuple of None. Can be used to speed up sequential decoding and attend to longer context. 
+ See details in the docstring of the `mems` input above. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') + model = XLNetForQuestionAnsweringSimple.from_pretrained('xlnet-large-cased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + start_positions = torch.tensor([1]) + end_positions = torch.tensor([3]) + outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) + loss, start_scores, end_scores = outputs[:3] + + """ + def __init__(self, config): + super(XLNetForQuestionAnsweringSimple, self).__init__(config) + self.num_labels = config.num_labels + + self.transformer = XLNetModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward(self, input_ids, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, + token_type_ids=None, input_mask=None, head_mask=None, + start_positions=None, end_positions=None): + + outputs = self.transformer(input_ids, + attention_mask=attention_mask, + mems=mems, + perm_mask=perm_mask, + target_mapping=target_mapping, + token_type_ids=token_type_ids, + input_mask=input_mask, + head_mask=head_mask) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = 
logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + outputs = (start_logits, end_logits,) + outputs[2:] + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + outputs = (total_loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) + + @add_start_docstrings("""XLNet Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). 
""", XLNET_START_DOCSTRING, XLNET_INPUTS_DOCSTRING) diff --git a/pytorch_transformers/tests/modeling_tf_xlnet_test.py b/pytorch_transformers/tests/modeling_tf_xlnet_test.py index 8d61d6b5dc..01c4494664 100644 --- a/pytorch_transformers/tests/modeling_tf_xlnet_test.py +++ b/pytorch_transformers/tests/modeling_tf_xlnet_test.py @@ -28,9 +28,10 @@ from pytorch_transformers import XLNetConfig, is_tf_available if is_tf_available(): import tensorflow as tf - from pytorch_transformers.modeling_tf_xlnet import (TFXLNetModel, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) - # XLNetLMHeadModel, - # XLNetForSequenceClassification, XLNetForQuestionAnswering) + from pytorch_transformers.modeling_tf_xlnet import (TFXLNetModel, TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForQuestionAnsweringSimple, + TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) else: pytestmark = pytest.mark.skip("Require TensorFlow") @@ -39,9 +40,9 @@ from .configuration_common_test import ConfigTester class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes=(TFXLNetModel, ) if is_tf_available() else () - # all_model_classes=(TFXLNetModel, TFXLNetLMHeadModel, - # TFXLNetForSequenceClassification, TFXLNetForQuestionAnswering) if is_tf_available() else () + all_model_classes=(TFXLNetModel, TFXLNetLMHeadModel, + TFXLNetForSequenceClassification, + TFXLNetForQuestionAnsweringSimple) if is_tf_available() else () test_pruning = False class TFXLNetModelTester(object): @@ -169,128 +170,88 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): def create_and_check_xlnet_lm_head(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): - pass - # model = XLNetLMHeadModel(config) - # model.eval() + model = TFXLNetLMHeadModel(config) - # loss_1, all_logits_1, mems_1 = model(input_ids_1, token_type_ids=segment_ids, labels=lm_labels) + inputs_1 = {'input_ids': input_ids_1, + 
'token_type_ids': segment_ids} - # loss_2, all_logits_2, mems_2 = model(input_ids_2, token_type_ids=segment_ids, labels=lm_labels, mems=mems_1) + all_logits_1, mems_1 = model(inputs_1) - # logits, _ = model(input_ids_q, perm_mask=perm_mask, target_mapping=target_mapping) + inputs_2 = {'input_ids': input_ids_2, + 'mems': mems_1, + 'token_type_ids': segment_ids} - # result = { - # "loss_1": loss_1, - # "mems_1": mems_1, - # "all_logits_1": all_logits_1, - # "loss_2": loss_2, - # "mems_2": mems_2, - # "all_logits_2": all_logits_2, - # } + all_logits_2, mems_2 = model(inputs_2) - # self.parent.assertListEqual( - # list(result["loss_1"].size()), - # []) - # self.parent.assertListEqual( - # list(result["all_logits_1"].size()), - # [self.batch_size, self.seq_length, self.vocab_size]) - # self.parent.assertListEqual( - # list(list(mem.size()) for mem in result["mems_1"]), - # [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + inputs_3 = {'input_ids': input_ids_q, + 'perm_mask': perm_mask, + 'target_mapping': target_mapping} - # self.parent.assertListEqual( - # list(result["loss_2"].size()), - # []) - # self.parent.assertListEqual( - # list(result["all_logits_2"].size()), - # [self.batch_size, self.seq_length, self.vocab_size]) - # self.parent.assertListEqual( - # list(list(mem.size()) for mem in result["mems_2"]), - # [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + logits, _ = model(inputs_3) + + result = { + "mems_1": [mem.numpy() for mem in mems_1], + "all_logits_1": all_logits_1.numpy(), + "mems_2": [mem.numpy() for mem in mems_2], + "all_logits_2": all_logits_2.numpy(), + } + + self.parent.assertListEqual( + list(result["all_logits_1"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(list(mem.shape) for mem in result["mems_1"]), + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + + self.parent.assertListEqual( + 
list(result["all_logits_2"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + self.parent.assertListEqual( + list(list(mem.shape) for mem in result["mems_2"]), + [[self.mem_len, self.batch_size, self.hidden_size]] * self.num_hidden_layers) def create_and_check_xlnet_qa(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): - pass - # model = XLNetForQuestionAnswering(config) - # model.eval() + model = TFXLNetForQuestionAnsweringSimple(config) - # outputs = model(input_ids_1) - # start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, mems = outputs + inputs = {'input_ids': input_ids_1, + 'attention_mask': input_mask, + 'token_type_ids': segment_ids} + start_logits, end_logits, mems = model(inputs) - # outputs = model(input_ids_1, start_positions=sequence_labels, - # end_positions=sequence_labels, - # cls_index=sequence_labels, - # is_impossible=is_impossible_labels, - # p_mask=input_mask) + result = { + "start_logits": start_logits.numpy(), + "end_logits": end_logits.numpy(), + "mems": [m.numpy() for m in mems], + } - # outputs = model(input_ids_1, start_positions=sequence_labels, - # end_positions=sequence_labels, - # cls_index=sequence_labels, - # is_impossible=is_impossible_labels) - - # total_loss, mems = outputs - - # outputs = model(input_ids_1, start_positions=sequence_labels, - # end_positions=sequence_labels) - - # total_loss, mems = outputs - - # result = { - # "loss": total_loss, - # "start_top_log_probs": start_top_log_probs, - # "start_top_index": start_top_index, - # "end_top_log_probs": end_top_log_probs, - # "end_top_index": end_top_index, - # "cls_logits": cls_logits, - # "mems": mems, - # } - - # self.parent.assertListEqual( - # list(result["loss"].size()), - # []) - # self.parent.assertListEqual( - # list(result["start_top_log_probs"].size()), - # [self.batch_size, model.config.start_n_top]) - # 
self.parent.assertListEqual( - # list(result["start_top_index"].size()), - # [self.batch_size, model.config.start_n_top]) - # self.parent.assertListEqual( - # list(result["end_top_log_probs"].size()), - # [self.batch_size, model.config.start_n_top * model.config.end_n_top]) - # self.parent.assertListEqual( - # list(result["end_top_index"].size()), - # [self.batch_size, model.config.start_n_top * model.config.end_n_top]) - # self.parent.assertListEqual( - # list(result["cls_logits"].size()), - # [self.batch_size]) - # self.parent.assertListEqual( - # list(list(mem.size()) for mem in result["mems"]), - # [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + self.parent.assertListEqual( + list(result["start_logits"].shape), + [self.batch_size, self.seq_length]) + self.parent.assertListEqual( + list(result["end_logits"].shape), + [self.batch_size, self.seq_length]) + self.parent.assertListEqual( + list(list(mem.shape) for mem in result["mems"]), + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) def create_and_check_xlnet_sequence_classif(self, config, input_ids_1, input_ids_2, input_ids_q, perm_mask, input_mask, target_mapping, segment_ids, lm_labels, sequence_labels, is_impossible_labels): - pass - # model = XLNetForSequenceClassification(config) - # model.eval() + model = TFXLNetForSequenceClassification(config) - # logits, mems_1 = model(input_ids_1) - # loss, logits, mems_1 = model(input_ids_1, labels=sequence_labels) + logits, mems_1 = model(input_ids_1) - # result = { - # "loss": loss, - # "mems_1": mems_1, - # "logits": logits, - # } + result = { + "mems_1": [mem.numpy() for mem in mems_1], + "logits": logits.numpy(), + } - # self.parent.assertListEqual( - # list(result["loss"].size()), - # []) - # self.parent.assertListEqual( - # list(result["logits"].size()), - # [self.batch_size, self.type_sequence_label_size]) - # self.parent.assertListEqual( - # list(list(mem.size()) for mem in result["mems_1"]), 
- # [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) + self.parent.assertListEqual( + list(result["logits"].shape), + [self.batch_size, self.type_sequence_label_size]) + self.parent.assertListEqual( + list(list(mem.shape) for mem in result["mems_1"]), + [[self.seq_length, self.batch_size, self.hidden_size]] * self.num_hidden_layers) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs()