embeddings resizing + tie_weights
This commit is contained in:
@@ -507,23 +507,17 @@ class BertPredictionHeadTransform(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class BertLMPredictionHead(nn.Module):
|
class BertLMPredictionHead(nn.Module):
|
||||||
def __init__(self, config, bert_model_embedding_weights):
|
def __init__(self, config):
|
||||||
super(BertLMPredictionHead, self).__init__()
|
super(BertLMPredictionHead, self).__init__()
|
||||||
self.transform = BertPredictionHeadTransform(config)
|
self.transform = BertPredictionHeadTransform(config)
|
||||||
self.torchscript = config.torchscript
|
|
||||||
|
|
||||||
# The output weights are the same as the input embeddings, but there is
|
# The output weights are the same as the input embeddings, but there is
|
||||||
# an output-only bias for each token.
|
# an output-only bias for each token.
|
||||||
self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
|
self.decoder = nn.Linear(config.hidden_size,
|
||||||
bert_model_embedding_weights.size(0),
|
config.vocab_size,
|
||||||
bias=False)
|
bias=False)
|
||||||
|
|
||||||
if self.torchscript:
|
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
||||||
self.decoder.weight = nn.Parameter(bert_model_embedding_weights.clone())
|
|
||||||
else:
|
|
||||||
self.decoder.weight = bert_model_embedding_weights
|
|
||||||
|
|
||||||
self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
|
|
||||||
|
|
||||||
def forward(self, hidden_states):
|
def forward(self, hidden_states):
|
||||||
hidden_states = self.transform(hidden_states)
|
hidden_states = self.transform(hidden_states)
|
||||||
@@ -532,9 +526,9 @@ class BertLMPredictionHead(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class BertOnlyMLMHead(nn.Module):
|
class BertOnlyMLMHead(nn.Module):
|
||||||
def __init__(self, config, bert_model_embedding_weights):
|
def __init__(self, config):
|
||||||
super(BertOnlyMLMHead, self).__init__()
|
super(BertOnlyMLMHead, self).__init__()
|
||||||
self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
|
self.predictions = BertLMPredictionHead(config)
|
||||||
|
|
||||||
def forward(self, sequence_output):
|
def forward(self, sequence_output):
|
||||||
prediction_scores = self.predictions(sequence_output)
|
prediction_scores = self.predictions(sequence_output)
|
||||||
@@ -552,9 +546,9 @@ class BertOnlyNSPHead(nn.Module):
|
|||||||
|
|
||||||
|
|
||||||
class BertPreTrainingHeads(nn.Module):
|
class BertPreTrainingHeads(nn.Module):
|
||||||
def __init__(self, config, bert_model_embedding_weights):
|
def __init__(self, config):
|
||||||
super(BertPreTrainingHeads, self).__init__()
|
super(BertPreTrainingHeads, self).__init__()
|
||||||
self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
|
self.predictions = BertLMPredictionHead(config)
|
||||||
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
||||||
|
|
||||||
def forward(self, sequence_output, pooled_output):
|
def forward(self, sequence_output, pooled_output):
|
||||||
@@ -619,6 +613,11 @@ class BertModel(BertPreTrainedModel):
|
|||||||
|
|
||||||
self.apply(self.init_weights)
|
self.apply(self.init_weights)
|
||||||
|
|
||||||
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
|
old_embeddings = self.embeddings.word_embeddings
|
||||||
|
new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
|
||||||
|
self.embeddings.word_embeddings = new_embeddings
|
||||||
|
|
||||||
def _prune_heads(self, heads_to_prune):
|
def _prune_heads(self, heads_to_prune):
|
||||||
""" Prunes heads of the model.
|
""" Prunes heads of the model.
|
||||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||||
@@ -750,9 +749,20 @@ class BertForPreTraining(BertPreTrainedModel):
|
|||||||
super(BertForPreTraining, self).__init__(config)
|
super(BertForPreTraining, self).__init__(config)
|
||||||
|
|
||||||
self.bert = BertModel(config)
|
self.bert = BertModel(config)
|
||||||
self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
|
self.cls = BertPreTrainingHeads(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.apply(self.init_weights)
|
||||||
|
self.tie_weights()
|
||||||
|
|
||||||
|
def tie_weights(self):
|
||||||
|
""" Make sure we are sharing the input and output embeddings.
|
||||||
|
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
|
||||||
|
"""
|
||||||
|
input_embeddings = self.bert.embeddings.word_embeddings.weight
|
||||||
|
if self.config.torchscript:
|
||||||
|
self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
|
||||||
|
else:
|
||||||
|
self.cls.predictions.decoder.weight = input_embeddings # Tied weights
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
|
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
|
||||||
next_sentence_label=None, head_mask=None):
|
next_sentence_label=None, head_mask=None):
|
||||||
@@ -845,9 +855,20 @@ class BertForMaskedLM(BertPreTrainedModel):
|
|||||||
super(BertForMaskedLM, self).__init__(config)
|
super(BertForMaskedLM, self).__init__(config)
|
||||||
|
|
||||||
self.bert = BertModel(config)
|
self.bert = BertModel(config)
|
||||||
self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
|
self.cls = BertOnlyMLMHead(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.apply(self.init_weights)
|
||||||
|
self.tie_weights()
|
||||||
|
|
||||||
|
def tie_weights(self):
|
||||||
|
""" Make sure we are sharing the input and output embeddings.
|
||||||
|
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
|
||||||
|
"""
|
||||||
|
input_embeddings = self.bert.embeddings.word_embeddings.weight
|
||||||
|
if self.config.torchscript:
|
||||||
|
self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
|
||||||
|
else:
|
||||||
|
self.cls.predictions.decoder.weight = input_embeddings # Tied weights
|
||||||
|
|
||||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
|
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -104,7 +104,6 @@ class GPT2Config(PretrainedConfig):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
||||||
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
|
|
||||||
n_positions: Number of positional embeddings.
|
n_positions: Number of positional embeddings.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
n_embd: Dimensionality of the embeddings and hidden states.
|
||||||
@@ -119,14 +118,12 @@ class GPT2Config(PretrainedConfig):
|
|||||||
embd_pdrop: The dropout ratio for the embeddings.
|
embd_pdrop: The dropout ratio for the embeddings.
|
||||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
initializer_range: The sttdev of the truncated_normal_initializer for
|
||||||
initializing all weight matrices.
|
initializing all weight matrices.
|
||||||
predict_special_tokens: should we predict special tokens (when the model has a LM head)
|
|
||||||
"""
|
"""
|
||||||
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vocab_size_or_config_json_file=50257,
|
vocab_size_or_config_json_file=50257,
|
||||||
n_special=0,
|
|
||||||
n_positions=1024,
|
n_positions=1024,
|
||||||
n_ctx=1024,
|
n_ctx=1024,
|
||||||
n_embd=768,
|
n_embd=768,
|
||||||
@@ -137,7 +134,6 @@ class GPT2Config(PretrainedConfig):
|
|||||||
attn_pdrop=0.1,
|
attn_pdrop=0.1,
|
||||||
layer_norm_epsilon=1e-5,
|
layer_norm_epsilon=1e-5,
|
||||||
initializer_range=0.02,
|
initializer_range=0.02,
|
||||||
predict_special_tokens=True,
|
|
||||||
|
|
||||||
num_labels=1,
|
num_labels=1,
|
||||||
summary_type='token_ids',
|
summary_type='token_ids',
|
||||||
@@ -151,7 +147,6 @@ class GPT2Config(PretrainedConfig):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
|
||||||
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
|
|
||||||
n_positions: Number of positional embeddings.
|
n_positions: Number of positional embeddings.
|
||||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||||
n_embd: Dimensionality of the embeddings and hidden states.
|
n_embd: Dimensionality of the embeddings and hidden states.
|
||||||
@@ -166,7 +161,6 @@ class GPT2Config(PretrainedConfig):
|
|||||||
embd_pdrop: The dropout ratio for the embeddings.
|
embd_pdrop: The dropout ratio for the embeddings.
|
||||||
initializer_range: The sttdev of the truncated_normal_initializer for
|
initializer_range: The sttdev of the truncated_normal_initializer for
|
||||||
initializing all weight matrices.
|
initializing all weight matrices.
|
||||||
predict_special_tokens: should we predict special tokens (when the model has a LM head)
|
|
||||||
"""
|
"""
|
||||||
super(GPT2Config, self).__init__(**kwargs)
|
super(GPT2Config, self).__init__(**kwargs)
|
||||||
|
|
||||||
@@ -178,7 +172,6 @@ class GPT2Config(PretrainedConfig):
|
|||||||
self.__dict__[key] = value
|
self.__dict__[key] = value
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
elif isinstance(vocab_size_or_config_json_file, int):
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
self.vocab_size = vocab_size_or_config_json_file
|
||||||
self.n_special = n_special
|
|
||||||
self.n_ctx = n_ctx
|
self.n_ctx = n_ctx
|
||||||
self.n_positions = n_positions
|
self.n_positions = n_positions
|
||||||
self.n_embd = n_embd
|
self.n_embd = n_embd
|
||||||
@@ -189,7 +182,6 @@ class GPT2Config(PretrainedConfig):
|
|||||||
self.attn_pdrop = attn_pdrop
|
self.attn_pdrop = attn_pdrop
|
||||||
self.layer_norm_epsilon = layer_norm_epsilon
|
self.layer_norm_epsilon = layer_norm_epsilon
|
||||||
self.initializer_range = initializer_range
|
self.initializer_range = initializer_range
|
||||||
self.predict_special_tokens = predict_special_tokens
|
|
||||||
|
|
||||||
self.num_labels = num_labels
|
self.num_labels = num_labels
|
||||||
self.summary_type = summary_type
|
self.summary_type = summary_type
|
||||||
@@ -203,10 +195,6 @@ class GPT2Config(PretrainedConfig):
|
|||||||
"or the path to a pretrained model config file (str)"
|
"or the path to a pretrained model config file (str)"
|
||||||
)
|
)
|
||||||
|
|
||||||
@property
|
|
||||||
def total_tokens_embeddings(self):
|
|
||||||
return self.vocab_size + self.n_special
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def hidden_size(self):
|
def hidden_size(self):
|
||||||
return self.n_embd
|
return self.n_embd
|
||||||
@@ -347,34 +335,6 @@ class Block(nn.Module):
|
|||||||
return outputs # x, present, (attentions)
|
return outputs # x, present, (attentions)
|
||||||
|
|
||||||
|
|
||||||
class GPT2LMHead(nn.Module):
|
|
||||||
""" Language Model Head for the transformer """
|
|
||||||
|
|
||||||
def __init__(self, model_embeddings_weights, config):
|
|
||||||
super(GPT2LMHead, self).__init__()
|
|
||||||
self.n_embd = config.n_embd
|
|
||||||
self.vocab_size = config.vocab_size
|
|
||||||
self.predict_special_tokens = config.predict_special_tokens
|
|
||||||
self.torchscript = config.torchscript
|
|
||||||
embed_shape = model_embeddings_weights.shape
|
|
||||||
self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
|
|
||||||
self.set_embeddings_weights(model_embeddings_weights)
|
|
||||||
|
|
||||||
def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
|
|
||||||
self.predict_special_tokens = predict_special_tokens
|
|
||||||
# Export to TorchScript can't handle parameter sharing so we are cloning them.
|
|
||||||
if self.torchscript:
|
|
||||||
self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
|
|
||||||
else:
|
|
||||||
self.decoder.weight = model_embeddings_weights # Tied weights
|
|
||||||
|
|
||||||
def forward(self, hidden_state):
|
|
||||||
lm_logits = self.decoder(hidden_state)
|
|
||||||
if not self.predict_special_tokens:
|
|
||||||
lm_logits = lm_logits[..., :self.vocab_size]
|
|
||||||
return lm_logits
|
|
||||||
|
|
||||||
|
|
||||||
class GPT2PreTrainedModel(PreTrainedModel):
|
class GPT2PreTrainedModel(PreTrainedModel):
|
||||||
""" An abstract class to handle weights initialization and
|
""" An abstract class to handle weights initialization and
|
||||||
a simple interface for dowloading and loading pretrained models.
|
a simple interface for dowloading and loading pretrained models.
|
||||||
@@ -400,36 +360,6 @@ class GPT2PreTrainedModel(PreTrainedModel):
|
|||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
module.weight.data.fill_(1.0)
|
module.weight.data.fill_(1.0)
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|
|
||||||
"""
|
|
||||||
Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict.
|
|
||||||
Download and cache the pre-trained model file if needed.
|
|
||||||
|
|
||||||
Params:
|
|
||||||
pretrained_model_name_or_path: either:
|
|
||||||
- a str with the name of a pre-trained model to load selected in the list of:
|
|
||||||
. `gpt2`
|
|
||||||
- a path or url to a pretrained model archive containing:
|
|
||||||
. `gpt2_config.json` a configuration file for the model
|
|
||||||
. `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
|
|
||||||
- a path or url to a pretrained model archive containing:
|
|
||||||
. `gpt2_config.json` a configuration file for the model
|
|
||||||
. a TensorFlow checkpoint with trained weights
|
|
||||||
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
|
|
||||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
|
||||||
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
|
|
||||||
*inputs, **kwargs: additional input for the specific GPT2 class
|
|
||||||
"""
|
|
||||||
num_special_tokens = kwargs.pop('num_special_tokens', None)
|
|
||||||
|
|
||||||
model = super().from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
|
||||||
|
|
||||||
# Add additional embeddings for special tokens if needed
|
|
||||||
# This step also make sure we are still sharing the output and input embeddings after loading weights
|
|
||||||
model.set_num_special_tokens(num_special_tokens)
|
|
||||||
return model
|
|
||||||
|
|
||||||
|
|
||||||
class GPT2Model(GPT2PreTrainedModel):
|
class GPT2Model(GPT2PreTrainedModel):
|
||||||
"""OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners").
|
"""OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners").
|
||||||
@@ -447,13 +377,13 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
config.vocab_size - 1, ______________________
|
config.vocab_size - 1, ______________________
|
||||||
config.vocab_size,
|
config.vocab_size,
|
||||||
... -> special embeddings
|
... -> special embeddings
|
||||||
config.vocab_size + config.n_special - 1] ______________________
|
config.vocab_size + n_special - 1] ______________________
|
||||||
|
|
||||||
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is equal to
|
where total_tokens_embeddings is equal to
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
total_tokens_embeddings = config.vocab_size + config.n_special
|
total_tokens_embeddings = vocab_size + n_special
|
||||||
|
|
||||||
You should use the associated indices to index the embeddings.
|
You should use the associated indices to index the embeddings.
|
||||||
|
|
||||||
@@ -474,7 +404,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
|
|
||||||
self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
|
self.wte = nn.Embedding(config.vocab_size, config.n_embd)
|
||||||
self.wpe = nn.Embedding(config.n_positions, config.n_embd)
|
self.wpe = nn.Embedding(config.n_positions, config.n_embd)
|
||||||
self.drop = nn.Dropout(config.embd_pdrop)
|
self.drop = nn.Dropout(config.embd_pdrop)
|
||||||
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
|
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
|
||||||
@@ -482,26 +412,8 @@ class GPT2Model(GPT2PreTrainedModel):
|
|||||||
|
|
||||||
self.apply(self.init_weights)
|
self.apply(self.init_weights)
|
||||||
|
|
||||||
def set_num_special_tokens(self, num_special_tokens=None):
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
"""
|
self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
|
||||||
Update input embeddings with new embedding matrix if needed.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
num_special_tokens: Special tokens to be added to the embedding matrix
|
|
||||||
|
|
||||||
TODO Lysandre filled args
|
|
||||||
"""
|
|
||||||
if num_special_tokens is None or self.config.n_special == num_special_tokens:
|
|
||||||
return
|
|
||||||
# Update config
|
|
||||||
self.config.n_special = num_special_tokens
|
|
||||||
# Build new embeddings and initialize all new embeddings (in particular the special tokens)
|
|
||||||
old_embed = self.wte
|
|
||||||
self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
|
|
||||||
self.wte.to(old_embed.weight.device)
|
|
||||||
self.init_weights(self.wte)
|
|
||||||
# Copy word embeddings from the previous weights
|
|
||||||
self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
|
|
||||||
|
|
||||||
def _prune_heads(self, heads_to_prune):
|
def _prune_heads(self, heads_to_prune):
|
||||||
""" Prunes heads of the model.
|
""" Prunes heads of the model.
|
||||||
@@ -641,23 +553,20 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
|||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(GPT2LMHeadModel, self).__init__(config)
|
super(GPT2LMHeadModel, self).__init__(config)
|
||||||
self.transformer = GPT2Model(config)
|
self.transformer = GPT2Model(config)
|
||||||
self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.apply(self.init_weights)
|
||||||
|
self.tie_weights()
|
||||||
|
|
||||||
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
|
def tie_weights(self):
|
||||||
|
""" Make sure we are sharing the input and output embeddings.
|
||||||
|
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
|
||||||
"""
|
"""
|
||||||
Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
|
input_embeddings = self.transformer.wte.weight
|
||||||
|
if self.config.torchscript:
|
||||||
Args:
|
self.lm_head.weight = nn.Parameter(input_embeddings.clone())
|
||||||
num_special_tokens: Special tokens to be added to the embedding matrix
|
else:
|
||||||
predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
|
self.lm_head.weight = input_embeddings # Tied weights
|
||||||
Defaults to True.
|
|
||||||
|
|
||||||
TODO Lysandre filled args
|
|
||||||
"""
|
|
||||||
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
|
|
||||||
self.transformer.set_num_special_tokens(num_special_tokens)
|
|
||||||
self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
|
|
||||||
|
|
||||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
|
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
|
||||||
"""
|
"""
|
||||||
@@ -740,25 +649,20 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
|||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(GPT2DoubleHeadsModel, self).__init__(config)
|
super(GPT2DoubleHeadsModel, self).__init__(config)
|
||||||
self.transformer = GPT2Model(config)
|
self.transformer = GPT2Model(config)
|
||||||
self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||||
self.multiple_choice_head = SequenceSummary(config)
|
self.multiple_choice_head = SequenceSummary(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.apply(self.init_weights)
|
||||||
|
|
||||||
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
|
def tie_weights(self):
|
||||||
|
""" Make sure we are sharing the input and output embeddings.
|
||||||
|
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
|
||||||
"""
|
"""
|
||||||
Update input and output embeddings with new embedding matrix.Make sure we are sharing the embeddings
|
input_embeddings = self.transformer.wte.weight
|
||||||
|
if self.config.torchscript:
|
||||||
Args:
|
self.lm_head.weight = nn.Parameter(input_embeddings.clone())
|
||||||
num_special_tokens: Special tokens to be added to the embedding matrix
|
else:
|
||||||
predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
|
self.lm_head.weight = input_embeddings # Tied weights
|
||||||
Defaults to True.
|
|
||||||
|
|
||||||
TODO Lysandre filled args
|
|
||||||
"""
|
|
||||||
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
|
|
||||||
self.transformer.set_num_special_tokens(num_special_tokens)
|
|
||||||
self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
|
|
||||||
|
|
||||||
def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
|
def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
|
||||||
position_ids=None, past=None, head_mask=None):
|
position_ids=None, past=None, head_mask=None):
|
||||||
|
|||||||
@@ -156,7 +156,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
vocab_size_or_config_json_file=40478,
|
vocab_size_or_config_json_file=40478,
|
||||||
n_special=0,
|
|
||||||
n_positions=512,
|
n_positions=512,
|
||||||
n_ctx=512,
|
n_ctx=512,
|
||||||
n_embd=768,
|
n_embd=768,
|
||||||
@@ -190,7 +189,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
self.__dict__[key] = value
|
self.__dict__[key] = value
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
elif isinstance(vocab_size_or_config_json_file, int):
|
||||||
self.vocab_size = vocab_size_or_config_json_file
|
self.vocab_size = vocab_size_or_config_json_file
|
||||||
self.n_special = n_special
|
|
||||||
self.n_ctx = n_ctx
|
self.n_ctx = n_ctx
|
||||||
self.n_positions = n_positions
|
self.n_positions = n_positions
|
||||||
self.n_embd = n_embd
|
self.n_embd = n_embd
|
||||||
@@ -216,10 +214,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
|||||||
"or the path to a pretrained model config file (str)"
|
"or the path to a pretrained model config file (str)"
|
||||||
)
|
)
|
||||||
|
|
||||||
@property
|
|
||||||
def total_tokens_embeddings(self):
|
|
||||||
return self.vocab_size + self.n_special
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def hidden_size(self):
|
def hidden_size(self):
|
||||||
return self.n_embd
|
return self.n_embd
|
||||||
@@ -355,34 +349,6 @@ class Block(nn.Module):
|
|||||||
return outputs
|
return outputs
|
||||||
|
|
||||||
|
|
||||||
class OpenAIGPTLMHead(nn.Module):
|
|
||||||
""" Language Model Head for the transformer """
|
|
||||||
|
|
||||||
def __init__(self, model_embeddings_weights, config):
|
|
||||||
super(OpenAIGPTLMHead, self).__init__()
|
|
||||||
self.n_embd = config.n_embd
|
|
||||||
self.vocab_size = config.vocab_size
|
|
||||||
self.predict_special_tokens = config.predict_special_tokens
|
|
||||||
self.torchscript = config.torchscript
|
|
||||||
embed_shape = model_embeddings_weights.shape
|
|
||||||
self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
|
|
||||||
self.set_embeddings_weights(model_embeddings_weights)
|
|
||||||
|
|
||||||
def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
|
|
||||||
self.predict_special_tokens = predict_special_tokens
|
|
||||||
|
|
||||||
if self.torchscript:
|
|
||||||
self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
|
|
||||||
else:
|
|
||||||
self.decoder.weight = model_embeddings_weights # Tied weights
|
|
||||||
|
|
||||||
def forward(self, hidden_state):
|
|
||||||
lm_logits = self.decoder(hidden_state)
|
|
||||||
if not self.predict_special_tokens:
|
|
||||||
lm_logits = lm_logits[..., :self.vocab_size]
|
|
||||||
return lm_logits
|
|
||||||
|
|
||||||
|
|
||||||
class OpenAIGPTPreTrainedModel(PreTrainedModel):
|
class OpenAIGPTPreTrainedModel(PreTrainedModel):
|
||||||
""" An abstract class to handle weights initialization and
|
""" An abstract class to handle weights initialization and
|
||||||
a simple interface for dowloading and loading pretrained models.
|
a simple interface for dowloading and loading pretrained models.
|
||||||
@@ -408,36 +374,6 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
|
|||||||
module.bias.data.zero_()
|
module.bias.data.zero_()
|
||||||
module.weight.data.fill_(1.0)
|
module.weight.data.fill_(1.0)
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|
|
||||||
"""
|
|
||||||
Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
|
|
||||||
Download and cache the pre-trained model file if needed.
|
|
||||||
|
|
||||||
Params:
|
|
||||||
pretrained_model_name_or_path: either:
|
|
||||||
- a str with the name of a pre-trained model to load selected in the list of:
|
|
||||||
- a path or url to a pretrained model archive containing:
|
|
||||||
. `config.json` a configuration file for the model
|
|
||||||
. `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
|
|
||||||
- a path or url to a pretrained model archive containing:
|
|
||||||
. `config.json` a configuration file for the model
|
|
||||||
. a series of NumPy files containing OpenAI TensorFlow trained weights
|
|
||||||
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
|
|
||||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
|
||||||
state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
|
|
||||||
*inputs, **kwargs: additional input for the specific OpenAI-GPT class
|
|
||||||
"""
|
|
||||||
num_special_tokens = kwargs.get('num_special_tokens', None)
|
|
||||||
kwargs.pop('num_special_tokens', None)
|
|
||||||
|
|
||||||
model = super(PreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, pretrained_model_name_or_path, *inputs, **kwargs)
|
|
||||||
|
|
||||||
# Add additional embeddings for special tokens if needed
|
|
||||||
# This step also make sure we are still sharing the output and input embeddings after loading weights
|
|
||||||
model.set_num_special_tokens(num_special_tokens)
|
|
||||||
return model
|
|
||||||
|
|
||||||
|
|
||||||
class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
||||||
"""OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
|
"""OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
|
||||||
@@ -457,13 +393,13 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||||||
config.vocab_size - 1, ______________________
|
config.vocab_size - 1, ______________________
|
||||||
config.vocab_size,
|
config.vocab_size,
|
||||||
... -> special embeddings
|
... -> special embeddings
|
||||||
config.vocab_size + config.n_special - 1] ______________________
|
config.vocab_size + n_special - 1] ______________________
|
||||||
|
|
||||||
where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
|
where ``total_tokens_embeddings`` is:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
total_tokens_embeddings = config.vocab_size + config.n_special
|
total_tokens_embeddings = config.vocab_size + n_special
|
||||||
|
|
||||||
You should use the associated indices to index the embeddings.
|
You should use the associated indices to index the embeddings.
|
||||||
|
|
||||||
@@ -485,34 +421,15 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
|||||||
self.output_attentions = config.output_attentions
|
self.output_attentions = config.output_attentions
|
||||||
self.output_hidden_states = config.output_hidden_states
|
self.output_hidden_states = config.output_hidden_states
|
||||||
|
|
||||||
self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
|
self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)
|
||||||
self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
|
self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
|
||||||
self.drop = nn.Dropout(config.embd_pdrop)
|
self.drop = nn.Dropout(config.embd_pdrop)
|
||||||
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
|
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.apply(self.init_weights)
|
||||||
|
|
||||||
def set_num_special_tokens(self, num_special_tokens=None):
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
"""
|
self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
|
||||||
Update input embeddings with new embedding matrice if needed
|
|
||||||
|
|
||||||
Args:
|
|
||||||
num_special_tokens: Special tokens to be added to the embedding matrix
|
|
||||||
|
|
||||||
TODO Lysandre filled Args
|
|
||||||
|
|
||||||
"""
|
|
||||||
if num_special_tokens is None or self.config.n_special == num_special_tokens:
|
|
||||||
return
|
|
||||||
# Update config
|
|
||||||
self.config.n_special = num_special_tokens
|
|
||||||
# Build new embeddings and initialize all new embeddings (in particular the special tokens)
|
|
||||||
old_embed = self.tokens_embed
|
|
||||||
self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
|
|
||||||
self.tokens_embed.to(old_embed.weight.device)
|
|
||||||
self.init_weights(self.tokens_embed)
|
|
||||||
# Copy word embeddings from the previous weights
|
|
||||||
self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
|
|
||||||
|
|
||||||
def _prune_heads(self, heads_to_prune):
|
def _prune_heads(self, heads_to_prune):
|
||||||
""" Prunes heads of the model.
|
""" Prunes heads of the model.
|
||||||
@@ -657,24 +574,20 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
|||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(OpenAIGPTLMHeadModel, self).__init__(config)
|
super(OpenAIGPTLMHeadModel, self).__init__(config)
|
||||||
self.transformer = OpenAIGPTModel(config)
|
self.transformer = OpenAIGPTModel(config)
|
||||||
self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.apply(self.init_weights)
|
||||||
|
self.tie_weights()
|
||||||
|
|
||||||
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
|
def tie_weights(self):
|
||||||
|
""" Make sure we are sharing the input and output embeddings.
|
||||||
|
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
|
||||||
"""
|
"""
|
||||||
Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings
|
input_embeddings = self.transformer.tokens_embed.weight
|
||||||
|
if self.config.torchscript:
|
||||||
Args:
|
self.lm_head.weight = nn.Parameter(input_embeddings.clone())
|
||||||
num_special_tokens: Special tokens to be added to the embedding matrix
|
else:
|
||||||
predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
|
self.lm_head.weight = input_embeddings # Tied weights
|
||||||
Defaults to True.
|
|
||||||
|
|
||||||
TODO Lysandre filled Args
|
|
||||||
|
|
||||||
"""
|
|
||||||
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
|
|
||||||
self.transformer.set_num_special_tokens(num_special_tokens)
|
|
||||||
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
|
|
||||||
|
|
||||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
|
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
|
||||||
"""
|
"""
|
||||||
@@ -747,13 +660,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
config.vocab_size - 1, ______________________
|
config.vocab_size - 1, ______________________
|
||||||
config.vocab_size,
|
config.vocab_size,
|
||||||
... -> special embeddings
|
... -> special embeddings
|
||||||
config.vocab_size + config.n_special - 1] ______________________
|
config.vocab_size + n_special - 1] ______________________
|
||||||
|
|
||||||
where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
|
where ``total_tokens_embeddings`` is:
|
||||||
|
|
||||||
::
|
::
|
||||||
|
|
||||||
total_tokens_embeddings = config.vocab_size + config.n_special
|
total_tokens_embeddings = config.vocab_size + n_special
|
||||||
|
|
||||||
You should use the associated indices to index the embeddings.
|
You should use the associated indices to index the embeddings.
|
||||||
|
|
||||||
@@ -773,24 +686,21 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
|||||||
super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
|
super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
|
||||||
|
|
||||||
self.transformer = OpenAIGPTModel(config)
|
self.transformer = OpenAIGPTModel(config)
|
||||||
self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
|
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||||
self.multiple_choice_head = SequenceSummary(config)
|
self.multiple_choice_head = SequenceSummary(config)
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.apply(self.init_weights)
|
||||||
|
self.tie_weights()
|
||||||
|
|
||||||
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
|
def tie_weights(self):
|
||||||
""" Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
|
""" Make sure we are sharing the input and output embeddings.
|
||||||
|
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
|
||||||
Args:
|
|
||||||
num_special_tokens: Special tokens to be added to the embedding matrix
|
|
||||||
predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
|
|
||||||
Defaults to True.
|
|
||||||
|
|
||||||
TODO Lysandre filled Args
|
|
||||||
"""
|
"""
|
||||||
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
|
input_embeddings = self.transformer.tokens_embed.weight
|
||||||
self.transformer.set_num_special_tokens(num_special_tokens)
|
if self.config.torchscript:
|
||||||
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
|
self.lm_head.weight = nn.Parameter(input_embeddings.clone())
|
||||||
|
else:
|
||||||
|
self.lm_head.weight = input_embeddings # Tied weights
|
||||||
|
|
||||||
def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
|
def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
|
||||||
position_ids=None, head_mask=None):
|
position_ids=None, head_mask=None):
|
||||||
|
|||||||
@@ -287,6 +287,10 @@ class TransfoXLConfig(PretrainedConfig):
|
|||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||||
"or the path to a pretrained model config file (str)")
|
"or the path to a pretrained model config file (str)")
|
||||||
|
|
||||||
|
@property
|
||||||
|
def vocab_size(self):
|
||||||
|
return self.n_token
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def hidden_size(self):
|
def hidden_size(self):
|
||||||
return self.d_model
|
return self.d_model
|
||||||
@@ -998,6 +1002,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
|||||||
|
|
||||||
self.apply(self.init_weights)
|
self.apply(self.init_weights)
|
||||||
|
|
||||||
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
def backward_compatible(self):
|
def backward_compatible(self):
|
||||||
self.sample_softmax = -1
|
self.sample_softmax = -1
|
||||||
|
|
||||||
|
|||||||
@@ -151,6 +151,7 @@ class PreTrainedModel(nn.Module):
|
|||||||
pretrained_model_archive_map = {}
|
pretrained_model_archive_map = {}
|
||||||
load_tf_weights = lambda model, config, path: None
|
load_tf_weights = lambda model, config, path: None
|
||||||
base_model_prefix = ""
|
base_model_prefix = ""
|
||||||
|
input_embeddings = None
|
||||||
|
|
||||||
def __init__(self, config, *inputs, **kwargs):
|
def __init__(self, config, *inputs, **kwargs):
|
||||||
super(PreTrainedModel, self).__init__()
|
super(PreTrainedModel, self).__init__()
|
||||||
@@ -164,12 +165,48 @@ class PreTrainedModel(nn.Module):
|
|||||||
# Save config in model
|
# Save config in model
|
||||||
self.config = config
|
self.config = config
|
||||||
|
|
||||||
|
def _get_resized_embeddings(self, old_embeddings, new_num_tokens):
|
||||||
|
# Build new embeddings
|
||||||
|
old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
|
||||||
|
new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
|
||||||
|
new_embeddings.to(old_embeddings.weight.device)
|
||||||
|
|
||||||
|
# initialize all new embeddings (in particular added tokens)
|
||||||
|
self.init_weights(new_embeddings)
|
||||||
|
|
||||||
|
# Copy word embeddings from the previous weights
|
||||||
|
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
|
||||||
|
new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
|
||||||
|
|
||||||
|
return new_embeddings
|
||||||
|
|
||||||
|
def resize_token_embeddings(self, new_num_tokens):
|
||||||
|
""" Resize input token embeddings matrix.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
new_num_tokens: New number of tokens in the embedding matrix.
|
||||||
|
Increasing the size will add newly initialized vectors at the end
|
||||||
|
Reducing the size will remove vectors from the end
|
||||||
|
"""
|
||||||
|
if new_num_tokens == self.config.vocab_size:
|
||||||
|
return
|
||||||
|
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
|
||||||
|
base_model._resize_token_embeddings(new_num_tokens)
|
||||||
|
|
||||||
|
# Update base model and current model config
|
||||||
|
self.config.vocab_size = new_num_tokens
|
||||||
|
base_model.vocab_size = new_num_tokens
|
||||||
|
|
||||||
|
# Tie weights again if needed
|
||||||
|
if hasattr(self, 'tie_weights'):
|
||||||
|
self.tie_weights()
|
||||||
|
|
||||||
def prune_heads(self, heads_to_prune):
|
def prune_heads(self, heads_to_prune):
|
||||||
""" Prunes heads of the base model.
|
""" Prunes heads of the base model.
|
||||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||||
"""
|
"""
|
||||||
model_to_prune = getattr(self, self.base_model_prefix, self) # get the base model if needed
|
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
|
||||||
model_to_prune._prune_heads(heads_to_prune)
|
base_model._prune_heads(heads_to_prune)
|
||||||
|
|
||||||
def save_pretrained(self, save_directory):
|
def save_pretrained(self, save_directory):
|
||||||
""" Save a model with its configuration file to a directory, so that it
|
""" Save a model with its configuration file to a directory, so that it
|
||||||
|
|||||||
@@ -104,7 +104,6 @@ class XLMConfig(PretrainedConfig):
|
|||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
vocab_size_or_config_json_file=30145,
|
vocab_size_or_config_json_file=30145,
|
||||||
n_special=0,
|
|
||||||
emb_dim=2048,
|
emb_dim=2048,
|
||||||
n_layers=12,
|
n_layers=12,
|
||||||
n_heads=16,
|
n_heads=16,
|
||||||
@@ -148,7 +147,6 @@ class XLMConfig(PretrainedConfig):
|
|||||||
self.__dict__[key] = value
|
self.__dict__[key] = value
|
||||||
elif isinstance(vocab_size_or_config_json_file, int):
|
elif isinstance(vocab_size_or_config_json_file, int):
|
||||||
self.n_words = vocab_size_or_config_json_file
|
self.n_words = vocab_size_or_config_json_file
|
||||||
self.n_special = n_special
|
|
||||||
self.emb_dim = emb_dim
|
self.emb_dim = emb_dim
|
||||||
self.n_layers = n_layers
|
self.n_layers = n_layers
|
||||||
self.n_heads = n_heads
|
self.n_heads = n_heads
|
||||||
@@ -183,8 +181,8 @@ class XLMConfig(PretrainedConfig):
|
|||||||
"or the path to a pretrained model config file (str)")
|
"or the path to a pretrained model config file (str)")
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def total_tokens_embeddings(self):
|
def vocab_size(self):
|
||||||
return self.n_words + self.n_special
|
return self.n_words
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def hidden_size(self):
|
def hidden_size(self):
|
||||||
@@ -479,6 +477,9 @@ class XLMModel(XLMPreTrainedModel):
|
|||||||
|
|
||||||
self.apply(self.init_weights)
|
self.apply(self.init_weights)
|
||||||
|
|
||||||
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
|
self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
|
||||||
|
|
||||||
def _prune_heads(self, heads_to_prune):
|
def _prune_heads(self, heads_to_prune):
|
||||||
""" Prunes heads of the model.
|
""" Prunes heads of the model.
|
||||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||||
@@ -718,8 +719,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
|||||||
"""
|
"""
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
super(XLMWithLMHeadModel, self).__init__(config)
|
super(XLMWithLMHeadModel, self).__init__(config)
|
||||||
self.torchscript = config.torchscript
|
|
||||||
|
|
||||||
self.transformer = XLMModel(config)
|
self.transformer = XLMModel(config)
|
||||||
self.pred_layer = XLMPredLayer(config)
|
self.pred_layer = XLMPredLayer(config)
|
||||||
|
|
||||||
@@ -729,7 +728,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
|||||||
def tie_weights(self):
|
def tie_weights(self):
|
||||||
""" Make sure we are sharing the embeddings
|
""" Make sure we are sharing the embeddings
|
||||||
"""
|
"""
|
||||||
if self.torchscript:
|
if self.config.torchscript:
|
||||||
self.pred_layer.proj.weight = nn.Parameter(self.transformer.embeddings.weight.clone())
|
self.pred_layer.proj.weight = nn.Parameter(self.transformer.embeddings.weight.clone())
|
||||||
else:
|
else:
|
||||||
self.pred_layer.proj.weight = self.transformer.embeddings.weight
|
self.pred_layer.proj.weight = self.transformer.embeddings.weight
|
||||||
|
|||||||
@@ -312,6 +312,10 @@ class XLNetConfig(PretrainedConfig):
|
|||||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||||
"or the path to a pretrained model config file (str)")
|
"or the path to a pretrained model config file (str)")
|
||||||
|
|
||||||
|
@property
|
||||||
|
def vocab_size(self):
|
||||||
|
return self.n_token
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def hidden_size(self):
|
def hidden_size(self):
|
||||||
return self.d_model
|
return self.d_model
|
||||||
@@ -654,6 +658,9 @@ class XLNetModel(XLNetPreTrainedModel):
|
|||||||
|
|
||||||
self.apply(self.init_weights)
|
self.apply(self.init_weights)
|
||||||
|
|
||||||
|
def _resize_token_embeddings(self, new_num_tokens):
|
||||||
|
self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
|
||||||
|
|
||||||
def _prune_heads(self, heads_to_prune):
|
def _prune_heads(self, heads_to_prune):
|
||||||
logger.info("Head pruning is not implemented for XLNet")
|
logger.info("Head pruning is not implemented for XLNet")
|
||||||
pass
|
pass
|
||||||
@@ -970,20 +977,17 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
|||||||
super(XLNetLMHeadModel, self).__init__(config)
|
super(XLNetLMHeadModel, self).__init__(config)
|
||||||
self.attn_type = config.attn_type
|
self.attn_type = config.attn_type
|
||||||
self.same_length = config.same_length
|
self.same_length = config.same_length
|
||||||
self.torchscript = config.torchscript
|
|
||||||
|
|
||||||
self.transformer = XLNetModel(config)
|
self.transformer = XLNetModel(config)
|
||||||
self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
|
self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
|
||||||
|
|
||||||
# Tie weights
|
|
||||||
|
|
||||||
self.apply(self.init_weights)
|
self.apply(self.init_weights)
|
||||||
self.tie_weights()
|
self.tie_weights()
|
||||||
|
|
||||||
def tie_weights(self):
|
def tie_weights(self):
|
||||||
""" Make sure we are sharing the embeddings
|
""" Make sure we are sharing the embeddings
|
||||||
"""
|
"""
|
||||||
if self.torchscript:
|
if self.config.torchscript:
|
||||||
self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone())
|
self.lm_loss.weight = nn.Parameter(self.transformer.word_embedding.weight.clone())
|
||||||
else:
|
else:
|
||||||
self.lm_loss.weight = self.transformer.word_embedding.weight
|
self.lm_loss.weight = self.transformer.word_embedding.weight
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
|
|||||||
BertForTokenClassification, BertForMultipleChoice)
|
BertForTokenClassification, BertForMultipleChoice)
|
||||||
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
|
from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
|
||||||
|
|
||||||
|
|
||||||
class BertModelTest(unittest.TestCase):
|
class BertModelTest(unittest.TestCase):
|
||||||
|
|||||||
@@ -22,8 +22,15 @@ import shutil
|
|||||||
import json
|
import json
|
||||||
import random
|
import random
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import logging
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
|
from pytorch_transformers import PretrainedConfig, PreTrainedModel
|
||||||
|
from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
|
|
||||||
def _config_zero_init(config):
|
def _config_zero_init(config):
|
||||||
configs_no_init = copy.deepcopy(config)
|
configs_no_init = copy.deepcopy(config)
|
||||||
for key in configs_no_init.__dict__.keys():
|
for key in configs_no_init.__dict__.keys():
|
||||||
@@ -242,6 +249,7 @@ class ConfigTester(object):
|
|||||||
|
|
||||||
def create_and_test_config_common_properties(self):
|
def create_and_test_config_common_properties(self):
|
||||||
config = self.config_class(**self.inputs_dict)
|
config = self.config_class(**self.inputs_dict)
|
||||||
|
self.parent.assertTrue(hasattr(config, 'vocab_size'))
|
||||||
self.parent.assertTrue(hasattr(config, 'hidden_size'))
|
self.parent.assertTrue(hasattr(config, 'hidden_size'))
|
||||||
self.parent.assertTrue(hasattr(config, 'num_attention_heads'))
|
self.parent.assertTrue(hasattr(config, 'num_attention_heads'))
|
||||||
self.parent.assertTrue(hasattr(config, 'num_hidden_layers'))
|
self.parent.assertTrue(hasattr(config, 'num_hidden_layers'))
|
||||||
@@ -276,7 +284,6 @@ class GPTModelTester(object):
|
|||||||
use_token_type_ids=True,
|
use_token_type_ids=True,
|
||||||
use_labels=True,
|
use_labels=True,
|
||||||
vocab_size=99,
|
vocab_size=99,
|
||||||
n_special=1,
|
|
||||||
n_positions=33,
|
n_positions=33,
|
||||||
hidden_size=32,
|
hidden_size=32,
|
||||||
num_hidden_layers=5,
|
num_hidden_layers=5,
|
||||||
@@ -299,7 +306,6 @@ class GPTModelTester(object):
|
|||||||
self.use_token_type_ids = use_token_type_ids
|
self.use_token_type_ids = use_token_type_ids
|
||||||
self.use_labels = use_labels
|
self.use_labels = use_labels
|
||||||
self.vocab_size = vocab_size
|
self.vocab_size = vocab_size
|
||||||
self.n_special = n_special
|
|
||||||
self.n_positions = n_positions
|
self.n_positions = n_positions
|
||||||
self.hidden_size = hidden_size
|
self.hidden_size = hidden_size
|
||||||
self.num_hidden_layers = num_hidden_layers
|
self.num_hidden_layers = num_hidden_layers
|
||||||
@@ -316,7 +322,7 @@ class GPTModelTester(object):
|
|||||||
self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
|
self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
|
||||||
|
|
||||||
def prepare_config_and_inputs(self):
|
def prepare_config_and_inputs(self):
|
||||||
total_num_tokens = self.vocab_size + self.n_special
|
total_num_tokens = self.vocab_size
|
||||||
input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
|
input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
|
||||||
|
|
||||||
position_ids = None
|
position_ids = None
|
||||||
@@ -338,7 +344,6 @@ class GPTModelTester(object):
|
|||||||
|
|
||||||
config = self.config_class(
|
config = self.config_class(
|
||||||
vocab_size_or_config_json_file=self.vocab_size,
|
vocab_size_or_config_json_file=self.vocab_size,
|
||||||
n_special=self.n_special,
|
|
||||||
n_positions=self.n_positions,
|
n_positions=self.n_positions,
|
||||||
n_embd=self.hidden_size,
|
n_embd=self.hidden_size,
|
||||||
n_layer=self.num_hidden_layers,
|
n_layer=self.num_hidden_layers,
|
||||||
@@ -370,7 +375,7 @@ class GPTModelTester(object):
|
|||||||
outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
|
outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
|
||||||
loss, lm_logits = outputs[:2]
|
loss, lm_logits = outputs[:2]
|
||||||
|
|
||||||
total_voc = self.n_special + self.vocab_size
|
total_voc = self.vocab_size
|
||||||
self.parent.assertListEqual(
|
self.parent.assertListEqual(
|
||||||
list(lm_logits.size()),
|
list(lm_logits.size()),
|
||||||
[self.batch_size, self.n_choices, self.seq_length, total_voc])
|
[self.batch_size, self.n_choices, self.seq_length, total_voc])
|
||||||
@@ -400,7 +405,7 @@ class GPTModelTester(object):
|
|||||||
lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
|
lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
|
||||||
loss = [lm_loss, mc_loss]
|
loss = [lm_loss, mc_loss]
|
||||||
|
|
||||||
total_voc = self.n_special + self.vocab_size
|
total_voc = self.vocab_size
|
||||||
self.parent.assertListEqual(
|
self.parent.assertListEqual(
|
||||||
list(lm_logits.size()),
|
list(lm_logits.size()),
|
||||||
[self.batch_size, self.n_choices, self.seq_length, total_voc])
|
[self.batch_size, self.n_choices, self.seq_length, total_voc])
|
||||||
@@ -441,6 +446,30 @@ class GPTModelTester(object):
|
|||||||
self.create_and_check_commons(*config_and_inputs)
|
self.create_and_check_commons(*config_and_inputs)
|
||||||
|
|
||||||
def run_slow_tests(self):
|
def run_slow_tests(self):
|
||||||
config_and_inputs = self.prepare_config_and_inputs()
|
self.create_and_check_model_from_pretrained()
|
||||||
self.create_and_check_model_from_pretrained(*config_and_inputs)
|
|
||||||
|
|
||||||
|
|
||||||
|
class ModelUtilsTest(unittest.TestCase):
|
||||||
|
def test_model_from_pretrained(self):
|
||||||
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
||||||
|
config = BertConfig.from_pretrained(model_name)
|
||||||
|
self.assertIsNotNone(config)
|
||||||
|
self.assertIsInstance(config, PretrainedConfig)
|
||||||
|
|
||||||
|
model = BertModel.from_pretrained(model_name)
|
||||||
|
model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
|
||||||
|
self.assertIsNotNone(model)
|
||||||
|
self.assertIsInstance(model, PreTrainedModel)
|
||||||
|
for value in loading_info.values():
|
||||||
|
self.assertEqual(len(value), 0)
|
||||||
|
|
||||||
|
config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
|
||||||
|
model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
|
||||||
|
self.assertEqual(model.config.output_attentions, True)
|
||||||
|
self.assertEqual(model.config.output_hidden_states, True)
|
||||||
|
self.assertEqual(model.config, config)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
unittest.main()
|
||||||
@@ -28,7 +28,7 @@ import torch
|
|||||||
from pytorch_transformers import (GPT2Config, GPT2Model,
|
from pytorch_transformers import (GPT2Config, GPT2Model,
|
||||||
GPT2LMHeadModel, GPT2DoubleHeadsModel)
|
GPT2LMHeadModel, GPT2DoubleHeadsModel)
|
||||||
|
|
||||||
from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
|
from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
|
||||||
|
|
||||||
class GPT2ModelTest(unittest.TestCase):
|
class GPT2ModelTest(unittest.TestCase):
|
||||||
|
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ import torch
|
|||||||
from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
|
from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
|
||||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
||||||
|
|
||||||
from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
|
from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
|
||||||
|
|
||||||
class OpenAIModelTest(unittest.TestCase):
|
class OpenAIModelTest(unittest.TestCase):
|
||||||
|
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ import torch
|
|||||||
from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
|
from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
|
||||||
from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
|
from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
|
from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
|
||||||
|
|
||||||
class TransfoXLModelTest(unittest.TestCase):
|
class TransfoXLModelTest(unittest.TestCase):
|
||||||
class TransfoXLModelTester(object):
|
class TransfoXLModelTester(object):
|
||||||
|
|||||||
@@ -1,47 +0,0 @@
|
|||||||
# coding=utf-8
|
|
||||||
# Copyright 2018 HuggingFace Inc..
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
from __future__ import absolute_import
|
|
||||||
from __future__ import division
|
|
||||||
from __future__ import print_function
|
|
||||||
|
|
||||||
import unittest
|
|
||||||
import logging
|
|
||||||
|
|
||||||
from pytorch_transformers import PretrainedConfig, PreTrainedModel
|
|
||||||
from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
|
||||||
|
|
||||||
class ModelUtilsTest(unittest.TestCase):
|
|
||||||
def test_model_from_pretrained(self):
|
|
||||||
logging.basicConfig(level=logging.INFO)
|
|
||||||
for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
|
|
||||||
config = BertConfig.from_pretrained(model_name)
|
|
||||||
self.assertIsNotNone(config)
|
|
||||||
self.assertIsInstance(config, PretrainedConfig)
|
|
||||||
|
|
||||||
model = BertModel.from_pretrained(model_name)
|
|
||||||
model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
|
|
||||||
self.assertIsNotNone(model)
|
|
||||||
self.assertIsInstance(model, PreTrainedModel)
|
|
||||||
for value in loading_info.values():
|
|
||||||
self.assertEqual(len(value), 0)
|
|
||||||
|
|
||||||
config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
|
|
||||||
model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
|
|
||||||
self.assertEqual(model.config.output_attentions, True)
|
|
||||||
self.assertEqual(model.config.output_hidden_states, True)
|
|
||||||
self.assertEqual(model.config, config)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
unittest.main()
|
|
||||||
@@ -23,7 +23,7 @@ import pytest
|
|||||||
from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
|
from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
|
||||||
from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
|
from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
|
from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
|
||||||
|
|
||||||
|
|
||||||
class XLMModelTest(unittest.TestCase):
|
class XLMModelTest(unittest.TestCase):
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ import torch
|
|||||||
from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
|
from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
|
||||||
from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
|
from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||||
|
|
||||||
from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
|
from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
|
||||||
|
|
||||||
class XLNetModelTest(unittest.TestCase):
|
class XLNetModelTest(unittest.TestCase):
|
||||||
class XLNetModelTester(object):
|
class XLNetModelTester(object):
|
||||||
|
|||||||
Reference in New Issue
Block a user