diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py index 0dc86563aa..2f7bdb3de4 100644 --- a/pytorch_transformers/__init__.py +++ b/pytorch_transformers/__init__.py @@ -120,6 +120,13 @@ if _tf_available: load_xlnet_pt_weights_in_tf2, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_xlm import (TFXLMPreTrainedModel, TFXLMMainLayer, + TFXLMModel, TFXLMWithLMHeadModel, + TFXLMForSequenceClassification, + TFXLMForQuestionAnsweringSimple, + load_xlm_pt_weights_in_tf2, + TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP) + # Files and general utilities from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, cached_path, add_start_docstrings, add_end_docstrings, diff --git a/pytorch_transformers/modeling_tf_xlm.py b/pytorch_transformers/modeling_tf_xlm.py new file mode 100644 index 0000000000..d0db638013 --- /dev/null +++ b/pytorch_transformers/modeling_tf_xlm.py @@ -0,0 +1,798 @@ +# coding=utf-8 +# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TF 2.0 XLM model. +""" +from __future__ import absolute_import, division, print_function, unicode_literals + +import logging +import math + +import itertools +import numpy as np +import tensorflow as tf + +from .configuration_xlm import XLMConfig +from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-tf_model.h5", + 'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-tf_model.h5", + 'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-tf_model.h5", + 'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-tf_model.h5", + 'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-tf_model.h5", + 'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-tf_model.h5", + 'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-tf_model.h5", + 'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-tf_model.h5", + 'xlm-mlm-17-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-17-1280-tf_model.h5", + 'xlm-mlm-100-1280': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-100-1280-tf_model.h5", +} + + +def load_xlm_pt_weights_in_tf2(tf_model, config, pytorch_checkpoint_path): + """ Load pytorch checkpoints in a TF 2.0 model and save it using HDF5 format + We use HDF5 to easily do transfer learning + (see https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357). + """ + try: + import re + import torch + import numpy + from tensorflow.python.keras import backend as K + except ImportError: + logger.error("Loading a PyTorch model in TensorFlow, requires PyTorch to be installed. Please see " + "https://pytorch.org/ for installation instructions.") + raise + + pt_path = os.path.abspath(pytorch_checkpoint_path) + logger.info("Loading PyTorch weights from {}".format(pt_path)) + # Load pytorch model + state_dict = torch.load(pt_path, map_location='cpu') + + inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] + tf_inputs = tf.constant(inputs_list) + tfo = tf_model(tf_inputs, training=False) # build the network + + symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights + weight_value_tuples = [] + all_pytorch_weights = set(list(state_dict.keys())) + for symbolic_weight in symbolic_weights: + name = symbolic_weight.name + name = name.replace(':0', '') + name = name.replace('__', '/') + name = name.split('/') + name = name[1:] + + transpose = bool(name[-1] == 'kernel') + if name[-1] == 'kernel' or name[-1] == 'embeddings' or name[-1] == 'gamma': + name[-1] = 'weight' + if name[-1] == 'beta': + name[-1] = 'bias' + + name = '.'.join(name) + assert name in state_dict, "{} not found in PyTorch model".format(name) + array = state_dict[name].numpy() + + if transpose: + array = numpy.transpose(array) + + try: + assert list(symbolic_weight.shape) == list(array.shape) + except AssertionError as e: + e.args += (symbolic_weight.shape, array.shape) + raise e + + logger.info("Initialize TF weight {}".format(symbolic_weight.name)) + + weight_value_tuples.append((symbolic_weight, array)) + all_pytorch_weights.discard(name) + + K.batch_set_value(weight_value_tuples) + + tfo = tf_model(tf_inputs, training=False) # Make sure restore ops are run + + logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights)) + + return tf_model + + +def create_sinusoidal_embeddings(n_pos, dim, out): + position_enc = np.array([ + [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] + for pos in range(n_pos) + ]) + out[:, 0::2] = tf.constant(np.sin(position_enc[:, 0::2])) + out[:, 1::2] = tf.constant(np.cos(position_enc[:, 1::2])) + + +def gelu(x): + """ Gaussian Error Linear Unit. + Original Implementation of the gelu activation function in Google Bert repo when initialy created. + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + Also see https://arxiv.org/abs/1606.08415 + """ + cdf = 0.5 * (1.0 + tf.math.erf(x / tf.math.sqrt(2.0))) + return x * cdf + + +def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32): + """ + Generate hidden states mask, and optionally an attention mask. + """ + bs = shape_list(lengths)[0] + if padding_mask is not None: + mask = padding_mask + else: + # assert lengths.max().item() <= slen + alen = tf.range(slen) + mask = tf.math.less(alen, lengths[:, tf.newaxis]) + + # attention mask is the same as mask, or triangular inferior attention (causal) + if causal: + attn_mask = tf.less_equal(tf.tile(alen[tf.newaxis, tf.newaxis, :], (bs, slen, 1)), + alen[tf.newaxis, :, tf.newaxis]) + else: + attn_mask = mask + + # sanity check + assert shape_list(mask) == [bs, slen] + assert causal is False or shape_list(attn_mask) == [bs, slen, slen] + + mask = tf.cast(mask, dtype=dtype) + attn_mask = tf.cast(attn_mask, dtype=dtype) + + return mask, attn_mask + + +class TFMultiHeadAttention(tf.keras.layers.Layer): + + NEW_ID = itertools.count() + + def __init__(self, n_heads, dim, config, **kwargs): + super(TFMultiHeadAttention, self).__init__(**kwargs) + self.layer_id = next(TFMultiHeadAttention.NEW_ID) + self.output_attentions = config.output_attentions + self.dim = dim + self.n_heads = n_heads + assert self.dim % self.n_heads == 0 + + self.q_lin = tf.keras.layers.Dense(dim, name='q_lin') + self.k_lin = tf.keras.layers.Dense(dim, name='k_lin') + self.v_lin = tf.keras.layers.Dense(dim, name='v_lin') + self.out_lin = tf.keras.layers.Dense(dim, name='out_lin') + self.dropout = tf.keras.layers.Dropout(config.attention_dropout) + self.pruned_heads = set() + + def prune_heads(self, heads): + raise NotImplementedError + + def call(self, inputs, training=False): + """ + Self-attention (if kv is None) or attention over source sentence (provided by kv). + """ + input, mask, kv, cache, head_mask = inputs + # Input is (bs, qlen, dim) + # Mask is (bs, klen) (non-causal) or (bs, klen, klen) + bs, qlen, dim = shape_list(input) + if kv is None: + klen = qlen if cache is None else cache['slen'] + qlen + else: + klen = shape_list(kv)[1] + # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) + n_heads = self.n_heads + dim_per_head = self.dim // n_heads + mask_reshape = (bs, 1, qlen, klen) if len(shape_list(mask)) == 3 else (bs, 1, 1, klen) + + def shape(x): + """ projection """ + return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3)) + + def unshape(x): + """ compute context """ + return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) + + q = shape(self.q_lin(input)) # (bs, n_heads, qlen, dim_per_head) + if kv is None: + k = shape(self.k_lin(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(input)) # (bs, n_heads, qlen, dim_per_head) + elif cache is None or self.layer_id not in cache: + k = v = kv + k = shape(self.k_lin(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v_lin(v)) # (bs, n_heads, qlen, dim_per_head) + + if cache is not None: + if self.layer_id in cache: + if kv is None: + k_, v_ = cache[self.layer_id] + k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) + v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) + else: + k, v = cache[self.layer_id] + cache[self.layer_id] = (k, v) + + q = q / math.sqrt(dim_per_head) # (bs, n_heads, qlen, dim_per_head) + scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) + mask = tf.reshape(mask, mask_reshape) # (bs, n_heads, qlen, klen) + # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) + scores = scores - 1e30 * (1.0 - mask) + + weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) + + outputs = (self.out_lin(context),) + if self.output_attentions: + outputs = outputs + (weights,) + return outputs + + +class TFTransformerFFN(tf.keras.layers.Layer): + + def __init__(self, in_dim, dim_hidden, out_dim, config, **kwargs): + super(TFTransformerFFN, self).__init__(**kwargs) + self.lin1 = tf.keras.layers.Dense(dim_hidden, name='lin1') + self.lin2 = tf.keras.layers.Dense(out_dim, name='lin2') + self.act = tf.keras.layers.Activation(gelu) if config.gelu_activation else tf.keras.activations.relu + self.dropout = tf.keras.layers.Dropout(config.dropout) + + def call(self, input, training=False): + x = self.lin1(input) + x = self.act(x) + x = self.lin2(x) + x = self.dropout(x, training=training) + return x + + +class TFXLMMainLayer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFXLMMainLayer, self).__init__(**kwargs) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + + # encoder / decoder, output layer + self.is_encoder = config.is_encoder + self.is_decoder = not config.is_encoder + if self.is_decoder: + raise NotImplementedError("Currently XLM can only be used as an encoder") + # self.with_output = with_output + self.causal = config.causal + + # dictionary / languages + self.n_langs = config.n_langs + self.use_lang_emb = config.use_lang_emb + self.n_words = config.n_words + self.eos_index = config.eos_index + self.pad_index = config.pad_index + # self.dico = dico + # self.id2lang = config.id2lang + # self.lang2id = config.lang2id + # assert len(self.dico) == self.n_words + # assert len(self.id2lang) == len(self.lang2id) == self.n_langs + + # model parameters + self.dim = config.emb_dim # 512 by default + self.hidden_dim = self.dim * 4 # 2048 by default + self.n_heads = config.n_heads # 8 by default + self.n_layers = config.n_layers + assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads' + + # embeddings + self.dropout = tf.keras.layers.Dropout(config.dropout) + self.attention_dropout = tf.keras.layers.Dropout(config.attention_dropout) + + self.position_embeddings = tf.keras.layers.Embedding(config.max_position_embeddings, self.dim, name='position_embeddings') + if config.sinusoidal_embeddings: + raise NotImplementedError + # create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight) + if config.n_langs > 1 and config.use_lang_emb: + self.lang_embeddings = tf.keras.layers.Embedding(self.n_langs, self.dim, name='lang_embeddings') + self.embeddings = TFSharedEmbeddings(self.n_words, self.dim, name='embeddings') # padding_idx=self.pad_index) + self.layer_norm_emb = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm_emb') + + # transformer layers + self.attentions = [] + self.layer_norm1 = [] + self.ffns = [] + self.layer_norm2 = [] + # if self.is_decoder: + # self.layer_norm15 = tf.keras.layers.LayerList() + # self.encoder_attn = tf.keras.layers.LayerList() + + for i in range(self.n_layers): + self.attentions.append(TFMultiHeadAttention(self.n_heads, self.dim, config=config, name='attentions__{}'.format(i))) + self.layer_norm1.append(tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm1__{}'.format(i))) + # if self.is_decoder: + # self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps)) + # self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout)) + self.ffns.append(TFTransformerFFN(self.dim, self.hidden_dim, self.dim, config=config, name='ffns__{}'.format(i))) + self.layer_norm2.append(tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name='layer_norm2__{}'.format(i))) + + if hasattr(config, "pruned_heads"): + pruned_heads = config.pruned_heads.copy().items() + config.pruned_heads = {} + for layer, heads in pruned_heads: + if self.attentions[int(layer)].n_heads == config.n_heads: + self.prune_heads({int(layer): list(map(int, heads))}) + + + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + raise NotImplementedError + + def call(self, inputs, training=False): # removed: src_enc=None, src_len=None + if not isinstance(inputs, (dict, tuple, list)): + input_ids = inputs + (attention_mask, langs, token_type_ids, position_ids, + lengths, cache, head_mask) = None, None, None, None, None, None, None + elif isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else None + langs = inputs[2] if len(inputs) > 2 else None + token_type_ids = inputs[3] if len(inputs) > 3 else None + position_ids = inputs[4] if len(inputs) > 4 else None + lengths = inputs[5] if len(inputs) > 5 else None + cache = inputs[6] if len(inputs) > 6 else None + head_mask = inputs[7] if len(inputs) > 7 else None + assert len(inputs) <= 8, "Too many inputs." + else: + input_ids = inputs.get('input_ids') + attention_mask = inputs.get('attention_mask', None) + langs = inputs.get('langs', None) + token_type_ids = inputs.get('token_type_ids', None) + position_ids = inputs.get('position_ids', None) + lengths = inputs.get('lengths', None) + cache = inputs.get('cache', None) + head_mask = inputs.get('head_mask', None) + assert len(inputs) <= 8, "Too many inputs." + + if lengths is None: + lengths = tf.reduce_sum(tf.cast(tf.not_equal(input_ids, self.pad_index), dtype=tf.int32), axis=1) + # mask = input_ids != self.pad_index + + # check inputs + bs, slen = shape_list(input_ids) + assert shape_list(lengths)[0] == bs + # assert lengths.max().item() <= slen + # input_ids = input_ids.transpose(0, 1) # batch size as dimension 0 + # assert (src_enc is None) == (src_len is None) + # if src_enc is not None: + # assert self.is_decoder + # assert src_enc.size(0) == bs + + # generate masks + mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask) + # if self.is_decoder and src_enc is not None: + # src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None] + + # position_ids + if position_ids is None: + position_ids = tf.expand_dims(tf.range(slen), axis=0) + else: + assert shape_list(position_ids) == [bs, slen] # (slen, bs) + # position_ids = position_ids.transpose(0, 1) + + # langs + if langs is not None: + assert shape_list(langs) == [bs, slen] # (slen, bs) + # langs = langs.transpose(0, 1) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.n_layers + + # do not recompute cached elements + if cache is not None: + _slen = slen - cache['slen'] + input_ids = input_ids[:, -_slen:] + position_ids = position_ids[:, -_slen:] + if langs is not None: + langs = langs[:, -_slen:] + mask = mask[:, -_slen:] + attn_mask = attn_mask[:, -_slen:] + + # embeddings + tensor = self.embeddings(input_ids) + tensor = tensor + self.position_embeddings(position_ids) + if langs is not None and self.use_lang_emb: + tensor = tensor + self.lang_embeddings(langs) + if token_type_ids is not None: + tensor = tensor + self.embeddings(token_type_ids) + tensor = self.layer_norm_emb(tensor) + tensor = self.dropout(tensor, training=training) + tensor = tensor * mask[..., tf.newaxis] + + # transformer layers + hidden_states = () + attentions = () + for i in range(self.n_layers): + if self.output_hidden_states: + hidden_states = hidden_states + (tensor,) + + # self attention + attn_outputs = self.attentions[i]([tensor, attn_mask, None, cache, head_mask[i]], training=training) + attn = attn_outputs[0] + if self.output_attentions: + attentions = attentions + (attn_outputs[1],) + attn = self.dropout(attn, training=training) + tensor = tensor + attn + tensor = self.layer_norm1[i](tensor) + + # encoder attention (for decoder only) + # if self.is_decoder and src_enc is not None: + # attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache) + # attn = F.dropout(attn, p=self.dropout, training=self.training) + # tensor = tensor + attn + # tensor = self.layer_norm15[i](tensor) + + # FFN + tensor = tensor + self.ffns[i](tensor) + tensor = self.layer_norm2[i](tensor) + tensor = tensor * mask[..., tf.newaxis] + + # Add last hidden state + if self.output_hidden_states: + hidden_states = hidden_states + (tensor,) + + # update cache length + if cache is not None: + cache['slen'] += tensor.size(1) + + # move back sequence length to dimension 0 + # tensor = tensor.transpose(0, 1) + + outputs = (tensor,) + if self.output_hidden_states: + outputs = outputs + (hidden_states,) + if self.output_attentions: + outputs = outputs + (attentions,) + return outputs # outputs, (hidden_states), (attentions) + + +class TFXLMPreTrainedModel(TFPreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = XLMConfig + pretrained_model_archive_map = TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP + load_pt_weights = load_xlm_pt_weights_in_tf2 + base_model_prefix = "transformer" + + +XLM_START_DOCSTRING = r""" The XLM model was proposed in + `Cross-lingual Language Model Pretraining`_ + by Guillaume Lample*, Alexis Conneau*. It's a transformer pre-trained using one of the following objectives: + + - a causal language modeling (CLM) objective (next token prediction), + - a masked language modeling (MLM) objective (Bert-like), or + - a Translation Language Modeling (TLM) object (extension of Bert's MLM to multiple language inputs) + + Original code can be found `here`_. + + This model is a PyTorch `torch.tf.keras.layers.Layer`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`Cross-lingual Language Model Pretraining`: + https://arxiv.org/abs/1901.07291 + + .. _`torch.tf.keras.layers.Layer`: + https://pytorch.org/docs/stable/nn.html#module + + .. _`here`: + https://github.com/facebookresearch/XLM + + Parameters: + config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +XLM_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + + XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + + Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`. + See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and + :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens to be used to indicate the language of each token in the input. + Indices are languages ids which can be obtained from the language names by using two conversion mappings + provided in the configuration of the model (only provided for multilingual models). + More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and + the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str). + **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + A parallel sequence of tokens (can be used to indicate various portions of the inputs). + The embeddings from these tokens will be summed with the respective token embeddings. + Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices). + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + **lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Length of each sentence that can be used to avoid performing attention on padding token indices. + You can also use `attention_mask` for the same result (see above), kept here for compatbility. + Indices selected in ``[0, ..., input_ids.size(-1)]``: + **cache**: + dictionary with ``torch.FloatTensor`` that contains pre-computed + hidden-states (key and values in the attention blocks) as computed by the model + (see `cache` output below). Can be used to speed up sequential decoding. + The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare XLM Model transformer outputing raw hidden-states without any specific head on top.", + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class TFXLMModel(TFXLMPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the last layer of the model. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = XLMModel.from_pretrained('xlm-mlm-en-2048') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLMModel, self).__init__(config, *inputs, **kwargs) + self.transformer = TFXLMMainLayer(config, name='transformer') + + def call(self, inputs, training=False): + outputs = self.transformer(inputs, training=training) + return outputs + + + +class TFXLMPredLayer(tf.keras.layers.Layer): + """ + Prediction layer (cross_entropy or adaptive_softmax). + """ + def __init__(self, config, input_embeddings, **kwargs): + super(TFXLMPredLayer, self).__init__(**kwargs) + self.asm = config.asm + self.n_words = config.n_words + self.pad_index = config.pad_index + if config.asm is False: + self.input_embeddings = input_embeddings + else: + raise NotImplementedError + # self.proj = nn.AdaptiveLogSoftmaxWithLoss( + # in_features=dim, + # n_classes=config.n_words, + # cutoffs=config.asm_cutoffs, + # div_value=config.asm_div_value, + # head_bias=True, # default is False + # ) + + def build(self, input_shape): + # The output weights are the same as the input embeddings, but there is an output-only bias for each token. + self.bias = self.add_weight(shape=(self.n_words,), + initializer='zeros', + trainable=True, + name='bias') + super(TFXLMPredLayer, self).build(input_shape) + + def call(self, hidden_states): + hidden_states = self.input_embeddings(hidden_states, mode="linear") + hidden_states = hidden_states + self.bias + return hidden_states + + +@add_start_docstrings("""The XLM Model transformer with a language modeling head on top + (linear layer with weights tied to the input embeddings). """, + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for language modeling. + Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids`` + Indices are selected in ``[-1, 0, ..., config.vocab_size]`` + All labels set to ``-1`` are ignored (masked), the loss is only + computed for labels in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLMWithLMHeadModel, self).__init__(config, *inputs, **kwargs) + self.transformer = TFXLMMainLayer(config, name='transformer') + self.pred_layer = TFXLMPredLayer(config, self.transformer.embeddings, name='pred_layer') + + + def call(self, inputs, training=False): + transformer_outputs = self.transformer(inputs, training=training) + + output = transformer_outputs[0] + outputs = self.pred_layer(output) + outputs = (outputs,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here + + return outputs + + +@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class TFXLMForSequenceClassification(TFXLMPreTrainedModel): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification (or regression if config.num_labels==1) loss. + **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLMForSequenceClassification, self).__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.transformer = TFXLMMainLayer(config, name='transformer') + self.sequence_summary = TFSequenceSummary(config, name='sequence_summary') + + def call(self, inputs, training=False): + transformer_outputs = self.transformer(inputs, training=training) + output = transformer_outputs[0] + + logits = self.sequence_summary(output) + + outputs = (logits,) + transformer_outputs[1:] # Keep new_mems and attention/hidden states if they are here + return outputs + + +@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING) +class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): + r""" + **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels whether a question has an answer or no answer (SQuAD 2.0) + **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the classification token to use as input for computing plausibility of the answer. + **p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...) + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') + model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + start_positions = torch.tensor([1]) + end_positions = torch.tensor([3]) + outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) + loss, start_scores, end_scores = outputs[:2] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXLMForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs) + self.transformer = TFXLMMainLayer(config, name='transformer') + self.qa_outputs = tf.keras.layers.Dense(config.num_labels, name='qa_outputs') + + def call(self, inputs, training=False): + transformer_outputs = self.transformer(inputs, training=training) + + sequence_output = transformer_outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + outputs = (start_logits, end_logits,) + transformer_outputs[1:] # Keep mems, hidden states, attentions if there are in it + + return outputs # start_logits, end_logits, (hidden_states), (attentions) diff --git a/pytorch_transformers/modeling_tf_xlnet.py b/pytorch_transformers/modeling_tf_xlnet.py index 12f0640531..a65213cb33 100644 --- a/pytorch_transformers/modeling_tf_xlnet.py +++ b/pytorch_transformers/modeling_tf_xlnet.py @@ -69,10 +69,8 @@ def load_xlnet_pt_weights_in_tf2(tf_model, config, pytorch_checkpoint_path): all_pytorch_weights = set(list(state_dict.keys())) for symbolic_weight in symbolic_weights: name = symbolic_weight.name - name = name.replace('cls_mlm', 'cls') # We had to split this layer in two in the TF model to be - name = name.replace('cls_nsp', 'cls') # able to do transfer learning (Keras only allow to remove full layers) name = name.replace(':0', '') - name = name.replace('layer__', 'layer/') + name = name.replace('__', '/') name = name.split('/') name = name[1:] @@ -887,8 +885,6 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): super(TFXLNetLMHeadModel, self).__init__(config, *inputs, **kwargs) - self.n_token = config.n_token - self.transformer = TFXLNetMainLayer(config, name='transformer') self.lm_loss = TFXLNetLMHead(config, self.transformer.word_embedding, name='lm_loss') @@ -993,8 +989,6 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): super(TFXLNetForQuestionAnsweringSimple, self).__init__(config, *inputs, **kwargs) - self.num_labels = config.num_labels - self.transformer = TFXLNetMainLayer(config, name='transformer') self.qa_outputs = tf.keras.layers.Dense(config.num_labels, name='qa_outputs') diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py index 34406f5b61..aeb1fcf689 100644 --- a/pytorch_transformers/modeling_xlm.py +++ b/pytorch_transformers/modeling_xlm.py @@ -337,11 +337,6 @@ class XLMModel(XLMPreTrainedModel): last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ - ATTRIBUTES = ['encoder', 'eos_index', 'pad_index', # 'with_output', - 'n_langs', 'use_lang_emb', 'n_words', 'dim', 'n_layers', 'n_heads', - 'hidden_dim', 'dropout', 'attention_dropout', 'asm', - 'asm_cutoffs', 'asm_div_value'] - def __init__(self, config): #, dico, is_encoder, with_output): super(XLMModel, self).__init__(config) self.output_attentions = config.output_attentions diff --git a/pytorch_transformers/tests/modeling_tf_xlm_test.py b/pytorch_transformers/tests/modeling_tf_xlm_test.py new file mode 100644 index 0000000000..5aed818ddd --- /dev/null +++ b/pytorch_transformers/tests/modeling_tf_xlm_test.py @@ -0,0 +1,264 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import shutil +import pytest + +from pytorch_transformers import is_tf_available + +if is_tf_available(): + import tensorflow as tf + from pytorch_transformers import (XLMConfig, TFXLMModel, + TFXLMWithLMHeadModel, + TFXLMForSequenceClassification, + TFXLMForQuestionAnsweringSimple, + TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP) +else: + pytestmark = pytest.mark.skip("Require TensorFlow") + +from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + + +class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): + + all_model_classes = (TFXLMModel, TFXLMWithLMHeadModel, + TFXLMForSequenceClassification, + TFXLMForQuestionAnsweringSimple) if is_tf_available() else () + + + class TFXLMModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_lengths=True, + use_token_type_ids=True, + use_labels=True, + gelu_activation=True, + sinusoidal_embeddings=False, + causal=False, + asm=False, + n_langs=2, + vocab_size=99, + n_special=0, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + summary_type="last", + use_proj=True, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_lengths = use_input_lengths + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.gelu_activation = gelu_activation + self.sinusoidal_embeddings = sinusoidal_embeddings + self.asm = asm + self.n_langs = n_langs + self.vocab_size = vocab_size + self.n_special = n_special + self.summary_type = summary_type + self.causal = causal + self.use_proj = use_proj + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.n_langs = n_langs + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.summary_type = summary_type + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + input_mask = ids_tensor([self.batch_size, self.seq_length], 2, dtype=tf.float32) + + input_lengths = None + if self.use_input_lengths: + input_lengths = ids_tensor([self.batch_size], vocab_size=2) + self.seq_length - 2 # small variation of seq_length + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.n_langs) + + sequence_labels = None + token_labels = None + is_impossible_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32) + + config = XLMConfig( + vocab_size_or_config_json_file=self.vocab_size, + n_special=self.n_special, + emb_dim=self.hidden_size, + n_layers=self.num_hidden_layers, + n_heads=self.num_attention_heads, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + gelu_activation=self.gelu_activation, + sinusoidal_embeddings=self.sinusoidal_embeddings, + asm=self.asm, + causal=self.causal, + n_langs=self.n_langs, + max_position_embeddings=self.max_position_embeddings, + initializer_range=self.initializer_range, + summary_type=self.summary_type, + use_proj=self.use_proj) + + return config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask + + def create_and_check_xlm_model(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + model = TFXLMModel(config=config) + inputs = {'input_ids': input_ids, + 'lengths': input_lengths, + 'langs': token_type_ids} + outputs = model(inputs) + + inputs = [input_ids, input_mask] + outputs = model(inputs) + sequence_output = outputs[0] + result = { + "sequence_output": sequence_output.numpy(), + } + self.parent.assertListEqual( + list(result["sequence_output"].shape), + [self.batch_size, self.seq_length, self.hidden_size]) + + + def create_and_check_xlm_lm_head(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + model = TFXLMWithLMHeadModel(config) + + inputs = {'input_ids': input_ids, + 'lengths': input_lengths, + 'langs': token_type_ids} + outputs = model(inputs) + + logits = outputs[0] + + result = { + "logits": logits.numpy(), + } + + self.parent.assertListEqual( + list(result["logits"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + + + def create_and_check_xlm_qa(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + model = TFXLMForQuestionAnsweringSimple(config) + + inputs = {'input_ids': input_ids, + 'lengths': input_lengths} + + outputs = model(inputs) + start_logits, end_logits = model(inputs) + + result = { + "start_logits": start_logits.numpy(), + "end_logits": end_logits.numpy(), + } + + self.parent.assertListEqual( + list(result["start_logits"].shape), + [self.batch_size, self.seq_length]) + self.parent.assertListEqual( + list(result["end_logits"].shape), + [self.batch_size, self.seq_length]) + + + def create_and_check_xlm_sequence_classif(self, config, input_ids, token_type_ids, input_lengths, sequence_labels, token_labels, is_impossible_labels, input_mask): + model = TFXLMForSequenceClassification(config) + + inputs = {'input_ids': input_ids, + 'lengths': input_lengths} + + (logits,) = model(inputs) + + result = { + "logits": logits.numpy(), + } + + self.parent.assertListEqual( + list(result["logits"].shape), + [self.batch_size, self.type_sequence_label_size]) + + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, token_type_ids, input_lengths, + sequence_labels, token_labels, is_impossible_labels, input_mask) = config_and_inputs + inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'lengths': input_lengths} + return config, inputs_dict + + def setUp(self): + self.model_tester = TFXLMModelTest.TFXLMModelTester(self) + self.config_tester = ConfigTester(self, config_class=XLMConfig, emb_dim=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_xlm_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_model(*config_and_inputs) + + def test_xlm_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs) + + def test_xlm_qa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_qa(*config_and_inputs) + + def test_xlm_sequence_classif(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/pytorch_transformers_test/" + for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + + +if __name__ == "__main__": + unittest.main() diff --git a/pytorch_transformers/tests/modeling_xlm_test.py b/pytorch_transformers/tests/modeling_xlm_test.py index acc8a68066..841bea94b7 100644 --- a/pytorch_transformers/tests/modeling_xlm_test.py +++ b/pytorch_transformers/tests/modeling_xlm_test.py @@ -272,20 +272,17 @@ class XLMModelTest(CommonTestCases.CommonModelTester): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_xlm_model(*config_and_inputs) - # config_and_inputs = tester.prepare_config_and_inputs() - # tester.create_and_check_xlm_for_masked_lm(*config_and_inputs) + def test_xlm_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_lm_head(*config_and_inputs) - # config_and_inputs = tester.prepare_config_and_inputs() - # tester.create_and_check_xlm_for_multiple_choice(*config_and_inputs) + def test_xlm_qa(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_qa(*config_and_inputs) - # config_and_inputs = tester.prepare_config_and_inputs() - # tester.create_and_check_xlm_for_question_answering(*config_and_inputs) - - # config_and_inputs = tester.prepare_config_and_inputs() - # tester.create_and_check_xlm_for_sequence_classification(*config_and_inputs) - - # config_and_inputs = tester.prepare_config_and_inputs() - # tester.create_and_check_xlm_for_token_classification(*config_and_inputs) + def test_xlm_sequence_classif(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_xlm_sequence_classif(*config_and_inputs) @pytest.mark.slow def test_model_from_pretrained(self):