Refactored Docstrings of BERT, GPT2, GPT, TransfoXL, XLM and XLNet.
This commit is contained in:
@@ -101,6 +101,25 @@ def gelu(x):
|
||||
|
||||
class GPT2Config(PretrainedConfig):
|
||||
"""Configuration class to store the configuration of a `GPT2Model`.
|
||||
|
||||
Args:
|
||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
||||
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
n_embd: Dimensionality of the embeddings and hidden states.
|
||||
n_layer: Number of hidden layers in the Transformer encoder.
|
||||
n_head: Number of attention heads for each attention layer in
|
||||
the Transformer encoder.
|
||||
layer_norm_epsilon: epsilon to use in the layer norm layers
|
||||
resid_pdrop: The dropout probabilitiy for all fully connected
|
||||
layers in the embeddings, encoder, and pooler.
|
||||
attn_pdrop: The dropout ratio for the attention
|
||||
probabilities.
|
||||
embd_pdrop: The dropout ratio for the embeddings.
|
||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
predict_special_tokens: should we predict special tokens (when the model has a LM head)
|
||||
"""
|
||||
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
@@ -418,9 +437,11 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
GPT-2 use a single embedding matrix to store the word and special embeddings.
|
||||
Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
|
||||
Special tokens need to be trained during the fine-tuning if you use them.
|
||||
The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
|
||||
The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
|
||||
|
||||
The embeddings are ordered as follow in the token embeddings matrix:
|
||||
::
|
||||
|
||||
The embeddings are ordered as follow in the token embeddings matrice:
|
||||
[0, ----------------------
|
||||
... -> word embeddings
|
||||
config.vocab_size - 1, ______________________
|
||||
@@ -428,47 +449,24 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
... -> special embeddings
|
||||
config.vocab_size + config.n_special - 1] ______________________
|
||||
|
||||
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
|
||||
total_tokens_embeddings = config.vocab_size + config.n_special
|
||||
You should use the associate indices to index the embeddings.
|
||||
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is equal to
|
||||
|
||||
Params:
|
||||
::
|
||||
|
||||
total_tokens_embeddings = config.vocab_size + config.n_special
|
||||
|
||||
You should use the associated indices to index the embeddings.
|
||||
|
||||
Args:
|
||||
`config`: a GPT2Config class instance with the configuration to build a new model
|
||||
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
|
||||
were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
|
||||
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
with the position indices (selected in the range [0, config.n_positions - 1[.
|
||||
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
You can use it to add a third type of embedding to each input token in the sequence
|
||||
(the previous two being the word and position embeddings).
|
||||
The input, position and token_type embeddings are summed inside the Transformer before the first
|
||||
self-attention block.
|
||||
`past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
|
||||
(key and values in the attention blocks) to speed up sequential decoding
|
||||
(this is the presents output of the model, cf. below).
|
||||
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
|
||||
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
|
||||
|
||||
Outputs a tuple consisting of:
|
||||
`hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)
|
||||
as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
|
||||
(or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
|
||||
`presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
|
||||
torch.FloatTensors. They can be reused to speed up sequential decoding.
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into BPE token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
Example::
|
||||
|
||||
config = modeling_gpt2.GPT2Config()
|
||||
|
||||
model = modeling_gpt2.GPT2Model(config)
|
||||
hidden_states, presents = model(input_ids)
|
||||
```
|
||||
config = modeling_gpt2.GPT2Config()
|
||||
model = modeling_gpt2.GPT2Model(config)
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
@@ -485,7 +483,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def set_num_special_tokens(self, num_special_tokens=None):
|
||||
" Update input embeddings with new embedding matrice if needed "
|
||||
"""Update input embeddings with new embedding matrix if needed."""
|
||||
if num_special_tokens is None or self.config.n_special == num_special_tokens:
|
||||
return
|
||||
# Update config
|
||||
@@ -506,6 +504,47 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
self.h[layer].attn.prune_heads(heads)
|
||||
|
||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
|
||||
"""
|
||||
Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
|
||||
|
||||
Args:
|
||||
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
|
||||
were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
|
||||
`position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
|
||||
with the position indices (selected in the range [0, config.n_positions - 1[.
|
||||
`token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
|
||||
You can use it to add a third type of embedding to each input token in the sequence
|
||||
(the previous two being the word and position embeddings).
|
||||
The input, position and token_type embeddings are summed inside the Transformer before the first
|
||||
self-attention block.
|
||||
`past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
|
||||
(key and values in the attention blocks) to speed up sequential decoding
|
||||
(this is the presents output of the model, cf. below).
|
||||
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
|
||||
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
|
||||
|
||||
Returns:
|
||||
A tuple consisting of ``hidden_states`` and ``presents``.
|
||||
|
||||
``hidden_states`` are a list of all the encoded-hidden-states in the model (length of the list: number of
|
||||
layers + 1 for the output of the embeddings) as ``torch.FloatTensor`` of size [batch_size, sequence_length,
|
||||
hidden_size] (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of
|
||||
input_ids).
|
||||
|
||||
``presents`` are a list of pre-computed hidden-states (key and values in each attention blocks) as
|
||||
torch.FloatTensors. They can be reused to speed up sequential decoding.
|
||||
|
||||
|
||||
Example::
|
||||
|
||||
# Already been converted into BPE token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
|
||||
hidden_states, presents = model(input_ids)
|
||||
# or
|
||||
hidden_states, presents = model.forward(input_ids)
|
||||
|
||||
"""
|
||||
if past is None:
|
||||
past_length = 0
|
||||
past = [None] * len(self.h)
|
||||
@@ -580,50 +619,18 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
"""OpenAI GPT-2 model with a Language Modeling head ("Language Models are Unsupervised Multitask Learners").
|
||||
|
||||
Params:
|
||||
Args:
|
||||
`config`: a GPT2Config class instance with the configuration to build a new model
|
||||
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
|
||||
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
|
||||
This can be used to compute head importance metrics. Default: False
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
|
||||
were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
|
||||
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
with the position indices (selected in the range [0, config.n_positions - 1[.
|
||||
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
You can use it to add a third type of embedding to each input token in the sequence
|
||||
(the previous two being the word and position embeddings).
|
||||
The input, position and token_type embeddings are summed inside the Transformer before the first
|
||||
self-attention block.
|
||||
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
|
||||
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
|
||||
is only computed for the labels set in [0, ..., vocab_size]
|
||||
`past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
|
||||
(key and values in the attention blocks) to speed up sequential decoding
|
||||
(this is the presents output of the model, cf. below).
|
||||
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
|
||||
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
|
||||
|
||||
Outputs:
|
||||
if `lm_labels` is not `None`:
|
||||
Outputs the language modeling loss.
|
||||
else a tuple:
|
||||
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, config.vocab_size]
|
||||
(or more generally [d_1, ..., d_n, config.vocab_size] were d_1 ... d_n are the dimension of input_ids)
|
||||
`presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
|
||||
torch.FloatTensors. They can be reused to speed up sequential decoding.
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into BPE token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
Example::
|
||||
|
||||
config = modeling_gpt2.GPT2Config()
|
||||
|
||||
model = modeling_gpt2.GPT2LMHeadModel(config)
|
||||
lm_logits, presents = model(input_ids)
|
||||
```
|
||||
config = modeling_gpt2.GPT2Config()
|
||||
model = modeling_gpt2.GPT2LMHeadModel(config)
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
@@ -633,14 +640,58 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
|
||||
""" Update input and output embeddings with new embedding matrice
|
||||
Make sure we are sharing the embeddings
|
||||
"""
|
||||
Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
|
||||
TODO Shouldn't we put args + returns ?
|
||||
"""
|
||||
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
|
||||
self.transformer.set_num_special_tokens(num_special_tokens)
|
||||
self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
|
||||
|
||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
|
||||
"""
|
||||
Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
|
||||
|
||||
Args:
|
||||
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
|
||||
were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
|
||||
`position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
|
||||
with the position indices (selected in the range [0, config.n_positions - 1[.
|
||||
`token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
|
||||
You can use it to add a third type of embedding to each input token in the sequence
|
||||
(the previous two being the word and position embeddings).
|
||||
The input, position and token_type embeddings are summed inside the Transformer before the first
|
||||
self-attention block.
|
||||
`lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, sequence_length]
|
||||
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
|
||||
is only computed for the labels set in [0, ..., vocab_size]
|
||||
`past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
|
||||
(key and values in the attention blocks) to speed up sequential decoding
|
||||
(this is the presents output of the model, cf. below).
|
||||
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
|
||||
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
|
||||
|
||||
Returns:
|
||||
If ``lm_labels`` is not ``None``, returns the language modeling loss. It ``lm_labels`` is ``None``, returns
|
||||
a tuple of (``lm_logits``, ``presents``).
|
||||
|
||||
``lm_logits`` is the language modeling logits as a ``torch.FloatTensor`` of size [batch_size,
|
||||
sequence_length, config.vocab_size] (or more generally [d_1, ..., d_n, config.vocab_size] were d_1 ...
|
||||
d_n are the dimension of input_ids).
|
||||
|
||||
``presents`` is a list of pre-computed hidden-states (key and values in each attention blocks) as
|
||||
torch.FloatTensors. They can be reused to speed up sequential decoding.
|
||||
|
||||
Example::
|
||||
|
||||
# Already been converted into BPE token ids
|
||||
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
|
||||
|
||||
lm_logits, presents = model(input_ids)
|
||||
# or
|
||||
lm_logits, presents = model.forward(input_ids)
|
||||
|
||||
"""
|
||||
transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
|
||||
hidden_states = transformer_outputs[0]
|
||||
|
||||
@@ -663,55 +714,16 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
"""OpenAI GPT-2 model with a Language Modeling and a Multiple Choice head ("Language Models are Unsupervised Multitask Learners").
|
||||
|
||||
Params:
|
||||
Args:
|
||||
`config`: a GPT2Config class instance with the configuration to build a new model
|
||||
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
|
||||
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
|
||||
This can be used to compute head importance metrics. Default: False
|
||||
|
||||
Inputs:
|
||||
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
|
||||
indices selected in the range [0, config.vocab_size[
|
||||
`mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from
|
||||
which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
|
||||
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
with the position indices (selected in the range [0, config.n_positions - 1[.
|
||||
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
|
||||
You can use it to add a third type of embedding to each input token in the sequence
|
||||
(the previous two being the word and position embeddings).
|
||||
The input, position and token_type embeddings are summed inside the Transformer before the first
|
||||
self-attention block.
|
||||
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
|
||||
with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss
|
||||
is only computed for the labels set in [0, ..., config.vocab_size]
|
||||
`multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
|
||||
with indices selected in [0, ..., num_choices].
|
||||
`past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
|
||||
(key and values in the attention blocks) to speed up sequential decoding
|
||||
(this is the presents output of the model, cf. below).
|
||||
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
|
||||
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
|
||||
Example::
|
||||
|
||||
Outputs:
|
||||
if `lm_labels` and `multiple_choice_labels` are not `None`:
|
||||
Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
|
||||
else: a tuple with
|
||||
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, config.vocab_size]
|
||||
`multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
|
||||
`presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
|
||||
torch.FloatTensors. They can be reused to speed up sequential decoding.
|
||||
|
||||
Example usage:
|
||||
```python
|
||||
# Already been converted into BPE token ids
|
||||
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]]) # (bsz, number of choice, seq length)
|
||||
mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
|
||||
|
||||
config = modeling_gpt2.GPT2Config()
|
||||
|
||||
model = modeling_gpt2.GPT2DoubleHeadsModel(config)
|
||||
lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids)
|
||||
```
|
||||
config = modeling_gpt2.GPT2Config()
|
||||
model = modeling_gpt2.GPT2DoubleHeadsModel(config)
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
@@ -723,8 +735,9 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
|
||||
""" Update input and output embeddings with new embedding matrice
|
||||
Make sure we are sharing the embeddings
|
||||
"""
|
||||
Update input and output embeddings with new embedding matrix.Make sure we are sharing the embeddings
|
||||
TODO Shouldn't we put args + returns ?
|
||||
"""
|
||||
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
|
||||
self.transformer.set_num_special_tokens(num_special_tokens)
|
||||
@@ -732,6 +745,55 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
|
||||
def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
|
||||
position_ids=None, past=None, head_mask=None):
|
||||
"""
|
||||
Performs a model forward pass. **Can be called by calling the class directly, once it has been instantiated.**
|
||||
|
||||
Args:
|
||||
`input_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length] with the BPE token
|
||||
indices selected in the range [0, config.vocab_size[
|
||||
`mc_token_ids`: a ``torch.LongTensor`` of shape [batch_size, num_choices] with the index of the token from
|
||||
which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
|
||||
`position_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
|
||||
with the position indices (selected in the range [0, config.n_positions - 1[.
|
||||
`token_type_ids`: an optional ``torch.LongTensor`` with the same shape as input_ids
|
||||
You can use it to add a third type of embedding to each input token in the sequence
|
||||
(the previous two being the word and position embeddings).
|
||||
The input, position and token_type embeddings are summed inside the Transformer before the first
|
||||
self-attention block.
|
||||
`lm_labels`: optional language modeling labels: ``torch.LongTensor`` of shape [batch_size, num_choices, sequence_length]
|
||||
with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss
|
||||
is only computed for the labels set in [0, ..., config.vocab_size]
|
||||
`multiple_choice_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
|
||||
with indices selected in [0, ..., num_choices].
|
||||
`past`: an optional list of ``torch.LongTensor`` that contains pre-computed hidden-states
|
||||
(key and values in the attention blocks) to speed up sequential decoding
|
||||
(this is the presents output of the model, cf. below).
|
||||
`head_mask`: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
|
||||
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
|
||||
|
||||
Returns:
|
||||
If ``lm_labels`` and ``multiple_choice_labels`` are not ``None``, outputs a
|
||||
``tuple(language_modeling_loss, multiple_choice_loss)``. If they are not ``None``, outputs a
|
||||
``tuple(lm_logits, multiple_choice_logits, presents)``.
|
||||
|
||||
``lm_logits``: the language modeling logits as a ``torch.FloatTensor`` of size [batch_size, num_choices, sequence_length, config.vocab_size]
|
||||
|
||||
``multiple_choice_logits``: the multiple choice logits as a ``torch.FloatTensor`` of size [batch_size, num_choices]
|
||||
|
||||
``presents``: a list of pre-computed hidden-states (key and values in each attention blocks) as
|
||||
torch.FloatTensors. They can be reused to speed up sequential decoding.
|
||||
|
||||
Example::
|
||||
|
||||
# Already been converted into BPE token ids
|
||||
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]]) # (bsz, number of choice, seq length)
|
||||
mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
|
||||
|
||||
lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids)
|
||||
# or
|
||||
lm_logits, multiple_choice_logits, presents = model.forward(input_ids, mc_token_ids)
|
||||
|
||||
"""
|
||||
transformer_outputs = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
|
||||
hidden_states = transformer_outputs[0]
|
||||
|
||||
|
||||
Reference in New Issue
Block a user