added doc for transformer-xl

This commit is contained in:
thomwolf
2019-07-15 10:11:09 +02:00
parent 4cb489457f
commit 0201d86015
3 changed files with 106 additions and 112 deletions

View File

@@ -382,10 +382,10 @@ GPT2_START_DOCSTRING = r""" OpenAI GPT-2 model was proposed in
https://pytorch.org/docs/stable/nn.html#module https://pytorch.org/docs/stable/nn.html#module
Parameters: Parameters:
config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
""" """
GPT2_INPUTS_DOCTRING = r""" Inputs: GPT2_INPUTS_DOCSTRING = r""" Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
@@ -413,7 +413,7 @@ GPT2_INPUTS_DOCTRING = r""" Inputs:
""" """
@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.", @add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING, GPT2_INPUTS_DOCTRING) GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class GPT2Model(GPT2PreTrainedModel): class GPT2Model(GPT2PreTrainedModel):
r""" r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -538,7 +538,7 @@ class GPT2Model(GPT2PreTrainedModel):
@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top @add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCTRING) (linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class GPT2LMHeadModel(GPT2PreTrainedModel): class GPT2LMHeadModel(GPT2PreTrainedModel):
r""" r"""
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:

View File

@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter from torch.nn.parameter import Parameter
from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
PreTrainedModel, prune_conv1d_layer, SequenceSummary) PreTrainedModel, prune_conv1d_layer, SequenceSummary,
add_start_docstrings)
from .modeling_bert import BertLayerNorm as LayerNorm from .modeling_bert import BertLayerNorm as LayerNorm
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -395,10 +396,10 @@ OPENAI_GPT_START_DOCSTRING = r""" OpenAI GPT model was proposed in
https://pytorch.org/docs/stable/nn.html#module https://pytorch.org/docs/stable/nn.html#module
Parameters: Parameters:
config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
""" """
OPENAI_GPT_INPUTS_DOCTRING = r""" Inputs: OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary. Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`. Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
@@ -422,7 +423,7 @@ OPENAI_GPT_INPUTS_DOCTRING = r""" Inputs:
""" """
@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.", @add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING, GPT2_INPUTS_DOCTRING) OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class OpenAIGPTModel(OpenAIGPTPreTrainedModel): class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
r""" r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -532,7 +533,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top @add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCTRING) (linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
r""" r"""
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:

View File

@@ -36,7 +36,7 @@ from torch.nn.parameter import Parameter
from .modeling_bert import BertLayerNorm as LayerNorm from .modeling_bert import BertLayerNorm as LayerNorm
from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
from .modeling_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -910,23 +910,71 @@ class TransfoXLPreTrainedModel(PreTrainedModel):
pass pass
TRANSFO_XL_START_DOCSTRING = r""" The Transformer-XL model was proposed in
`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`_
by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can reuse
previously computed hidden-states to attend to longer context (memory).
This model also uses adaptive softmax inputs and outputs (tied).
This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
refer to the PyTorch documentation for all matter related to general usage and behavior.
.. _`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context`:
https://arxiv.org/abs/1901.02860
.. _`torch.nn.Module`:
https://pytorch.org/docs/stable/nn.html#module
Parameters:
config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
"""
TRANSFO_XL_INPUTS_DOCSTRING = r"""
Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**mems**:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` output below). Can be used to speed up sequential decoding and attend to longer context.
**head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask indices selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
class TransfoXLModel(TransfoXLPreTrainedModel): class TransfoXLModel(TransfoXLPreTrainedModel):
"""Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"). r"""
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
Sequence of hidden-states at the last layer of the model.
**mems**: ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
Transformer XL uses relative positioning (with sinusiodal patterns) and adaptive softmax inputs which means that: Examples::
- you don't need to specify positioning embeddings indices. >>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
>>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
>>> model = TransfoXLModel(config)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
>>> last_hidden_states, mems = outputs[:2]
- the tokens in the vocabulary have to be sorted in decreasing frequency.
Args:
config: a TransfoXLConfig class instance with the configuration to build a new model
Example::
config = TransfoXLConfig()
model = TransfoXLModel(config)
""" """
def __init__(self, config): def __init__(self, config):
super(TransfoXLModel, self).__init__(config) super(TransfoXLModel, self).__init__(config)
@@ -1193,41 +1241,6 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return outputs # last hidden state, new_mems, (all hidden states), (all attentions) return outputs # last hidden state, new_mems, (all hidden states), (all attentions)
def forward(self, input_ids, mems=None, head_mask=None): def forward(self, input_ids, mems=None, head_mask=None):
"""
Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the token indices selected in the range [0, self.config.n_token[
`mems`: optional memory of hidden states from previous forward passes
as a list (num layers) of hidden states at the entry of each layer
each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
Returns:
A tuple of ``(last_hidden_state, new_mems)``.
``last_hidden_state``: the encoded-hidden-states at the top of the model
as a ``torch.FloatTensor`` of size [batch_size, sequence_length, self.config.d_model]
``new_mems``: list (num layers) of updated mem states at the entry of each layer
each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and
``labels``
Example::
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
last_hidden_state, new_mems = model(input_ids)
# or
last_hidden_state, new_mems = model.forward(input_ids)
# Another time on input_ids_next using the memory:
last_hidden_state, new_mems = model(input_ids_next, new_mems)
"""
# the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
# so we transpose here from shape [bsz, len] to shape [len, bsz] # so we transpose here from shape [bsz, len] to shape [len, bsz]
input_ids = input_ids.transpose(0, 1).contiguous() input_ids = input_ids.transpose(0, 1).contiguous()
@@ -1239,27 +1252,45 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
return outputs # last hidden state, new_mems, (all hidden states), (all attentions) return outputs # last hidden state, new_mems, (all hidden states), (all attentions)
@add_start_docstrings("""The Transformer-XL Model with a language modeling head on top
(adaptive softmax with weights tied to the adaptive input embeddings)""",
TRANSFO_XL_START_DOCSTRING, TRANSFO_XL_INPUTS_DOCSTRING)
class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
"""Transformer XL model ("Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"). r"""
**lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
Labels for language modeling.
Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
Indices are selected in ``[-1, 0, ..., config.vocab_size]``
All labels set to ``-1`` are ignored (masked), the loss is only
computed for labels in ``[0, ..., config.vocab_size]``
This model adds an (adaptive) softmax head on top of the ``TransfoXLModel`` Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Language modeling loss.
**prediction_scores**: ``None`` if ``lm_labels`` is provided else ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
We don't output them when the loss is computed to speedup adaptive softmax decoding.
**mems**: ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
list of ``torch.FloatTensor`` (one for each layer):
that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
(see `mems` input above). Can be used to speed up sequential decoding and attend to longer context.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
Transformer XL uses a relative positioning (with sinusoidal patterns) and adaptive softmax inputs which means that: Examples::
- you don't need to specify positioning embeddings indices >>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
>>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
>>> model = TransfoXLLMHeadModel(config)
>>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
>>> outputs = model(input_ids)
>>> prediction_scores, mems = outputs[:2]
- the tokens in the vocabulary have to be sorted in decreasing frequency.
Call ``self.tie_weights()`` if you update/load the weights of the transformer to keep the weights tied.
Args:
config: a ``TransfoXLConfig`` class instance with the configuration to build a new model
Example::
config = TransfoXLConfig()
model = TransfoXLModel(config)
""" """
def __init__(self, config): def __init__(self, config):
super(TransfoXLLMHeadModel, self).__init__(config) super(TransfoXLLMHeadModel, self).__init__(config)
@@ -1310,44 +1341,6 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
return self.transformer.init_mems(data) return self.transformer.init_mems(data)
def forward(self, input_ids, labels=None, mems=None, head_mask=None): def forward(self, input_ids, labels=None, mems=None, head_mask=None):
"""
Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
Args:
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the token indices selected in the range [0, self.config.n_token[
`labels`: an optional ``torch.LongTensor`` of shape [batch_size, sequence_length]
with the labels token indices selected in the range [0, self.config.n_token[
`mems`: an optional memory of hidden states from previous forward passes
as a list (num layers) of hidden states at the entry of each layer
each hidden states has shape [self.config.mem_len, bsz, self.config.d_model]
Note that the first two dimensions are transposed in `mems` with regards to `input_ids` and `labels`
Returns:
A tuple of (last_hidden_state, new_mems)
``last_hidden_state``: output of the (adaptive) softmax. If ``labels`` is ``None``, it is the negative
log likelihood of shape [batch_size, sequence_length]. Otherwise, it is the log probabilities of
tokens of, shape [batch_size, sequence_length, n_tokens].
``new_mems``: list (num layers) of updated mem states at the entry of each layer
each mem state is a ``torch.FloatTensor`` of size [self.config.mem_len, batch_size, self.config.d_model]
Note that the first two dimensions are transposed in ``mems`` with regards to ``input_ids`` and
``labels``
Example::
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_ids_next = torch.LongTensor([[53, 21, 1], [64, 23, 100]])
last_hidden_state, new_mems = model(input_ids)
# or
last_hidden_state, new_mems = model.forward(input_ids)
# Another time on input_ids_next using the memory:
last_hidden_state, new_mems = model(input_ids_next, mems=new_mems)
"""
bsz = input_ids.size(0) bsz = input_ids.size(0)
tgt_len = input_ids.size(1) tgt_len = input_ids.size(1)