embeddings resizing + tie_weights
This commit is contained in:
@@ -507,23 +507,17 @@ class BertPredictionHeadTransform(nn.Module):
|
||||
|
||||
|
||||
class BertLMPredictionHead(nn.Module):
|
||||
def __init__(self, config, bert_model_embedding_weights):
|
||||
def __init__(self, config):
|
||||
super(BertLMPredictionHead, self).__init__()
|
||||
self.transform = BertPredictionHeadTransform(config)
|
||||
self.torchscript = config.torchscript
|
||||
|
||||
# The output weights are the same as the input embeddings, but there is
|
||||
# an output-only bias for each token.
|
||||
self.decoder = nn.Linear(bert_model_embedding_weights.size(1),
|
||||
bert_model_embedding_weights.size(0),
|
||||
self.decoder = nn.Linear(config.hidden_size,
|
||||
config.vocab_size,
|
||||
bias=False)
|
||||
|
||||
if self.torchscript:
|
||||
self.decoder.weight = nn.Parameter(bert_model_embedding_weights.clone())
|
||||
else:
|
||||
self.decoder.weight = bert_model_embedding_weights
|
||||
|
||||
self.bias = nn.Parameter(torch.zeros(bert_model_embedding_weights.size(0)))
|
||||
self.bias = nn.Parameter(torch.zeros(config.vocab_size))
|
||||
|
||||
def forward(self, hidden_states):
|
||||
hidden_states = self.transform(hidden_states)
|
||||
@@ -532,9 +526,9 @@ class BertLMPredictionHead(nn.Module):
|
||||
|
||||
|
||||
class BertOnlyMLMHead(nn.Module):
|
||||
def __init__(self, config, bert_model_embedding_weights):
|
||||
def __init__(self, config):
|
||||
super(BertOnlyMLMHead, self).__init__()
|
||||
self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
|
||||
self.predictions = BertLMPredictionHead(config)
|
||||
|
||||
def forward(self, sequence_output):
|
||||
prediction_scores = self.predictions(sequence_output)
|
||||
@@ -552,9 +546,9 @@ class BertOnlyNSPHead(nn.Module):
|
||||
|
||||
|
||||
class BertPreTrainingHeads(nn.Module):
|
||||
def __init__(self, config, bert_model_embedding_weights):
|
||||
def __init__(self, config):
|
||||
super(BertPreTrainingHeads, self).__init__()
|
||||
self.predictions = BertLMPredictionHead(config, bert_model_embedding_weights)
|
||||
self.predictions = BertLMPredictionHead(config)
|
||||
self.seq_relationship = nn.Linear(config.hidden_size, 2)
|
||||
|
||||
def forward(self, sequence_output, pooled_output):
|
||||
@@ -619,6 +613,11 @@ class BertModel(BertPreTrainedModel):
|
||||
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
old_embeddings = self.embeddings.word_embeddings
|
||||
new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
|
||||
self.embeddings.word_embeddings = new_embeddings
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
@@ -750,9 +749,20 @@ class BertForPreTraining(BertPreTrainedModel):
|
||||
super(BertForPreTraining, self).__init__(config)
|
||||
|
||||
self.bert = BertModel(config)
|
||||
self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
|
||||
self.cls = BertPreTrainingHeads(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
|
||||
""" Make sure we are sharing the input and output embeddings.
|
||||
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
|
||||
"""
|
||||
input_embeddings = self.bert.embeddings.word_embeddings.weight
|
||||
if self.config.torchscript:
|
||||
self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
|
||||
else:
|
||||
self.cls.predictions.decoder.weight = input_embeddings # Tied weights
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
|
||||
next_sentence_label=None, head_mask=None):
|
||||
@@ -845,9 +855,20 @@ class BertForMaskedLM(BertPreTrainedModel):
|
||||
super(BertForMaskedLM, self).__init__(config)
|
||||
|
||||
self.bert = BertModel(config)
|
||||
self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
|
||||
self.cls = BertOnlyMLMHead(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
|
||||
""" Make sure we are sharing the input and output embeddings.
|
||||
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
|
||||
"""
|
||||
input_embeddings = self.bert.embeddings.word_embeddings.weight
|
||||
if self.config.torchscript:
|
||||
self.cls.predictions.decoder.weight = nn.Parameter(input_embeddings.clone())
|
||||
else:
|
||||
self.cls.predictions.decoder.weight = input_embeddings # Tied weights
|
||||
|
||||
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
|
||||
"""
|
||||
|
||||
@@ -104,7 +104,6 @@ class GPT2Config(PretrainedConfig):
|
||||
|
||||
Args:
|
||||
vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `GPT2Model` or a configuration json file.
|
||||
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
n_embd: Dimensionality of the embeddings and hidden states.
|
||||
@@ -119,14 +118,12 @@ class GPT2Config(PretrainedConfig):
|
||||
embd_pdrop: The dropout ratio for the embeddings.
|
||||
initializer_range: The stddev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
predict_special_tokens: should we predict special tokens (when the model has a LM head)
|
||||
"""
|
||||
pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size_or_config_json_file=50257,
|
||||
n_special=0,
|
||||
n_positions=1024,
|
||||
n_ctx=1024,
|
||||
n_embd=768,
|
||||
@@ -137,7 +134,6 @@ class GPT2Config(PretrainedConfig):
|
||||
attn_pdrop=0.1,
|
||||
layer_norm_epsilon=1e-5,
|
||||
initializer_range=0.02,
|
||||
predict_special_tokens=True,
|
||||
|
||||
num_labels=1,
|
||||
summary_type='token_ids',
|
||||
@@ -151,7 +147,6 @@ class GPT2Config(PretrainedConfig):
|
||||
|
||||
Args:
|
||||
vocab_size_or_config_json_file: Vocabulary size of `input_ids` in `GPT2Model` or a configuration json file.
|
||||
n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
|
||||
n_positions: Number of positional embeddings.
|
||||
n_ctx: Size of the causal mask (usually same as n_positions).
|
||||
n_embd: Dimensionality of the embeddings and hidden states.
|
||||
@@ -166,7 +161,6 @@ class GPT2Config(PretrainedConfig):
|
||||
embd_pdrop: The dropout ratio for the embeddings.
|
||||
initializer_range: The stddev of the truncated_normal_initializer for
|
||||
initializing all weight matrices.
|
||||
predict_special_tokens: should we predict special tokens (when the model has a LM head)
|
||||
"""
|
||||
super(GPT2Config, self).__init__(**kwargs)
|
||||
|
||||
@@ -178,7 +172,6 @@ class GPT2Config(PretrainedConfig):
|
||||
self.__dict__[key] = value
|
||||
elif isinstance(vocab_size_or_config_json_file, int):
|
||||
self.vocab_size = vocab_size_or_config_json_file
|
||||
self.n_special = n_special
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
@@ -189,7 +182,6 @@ class GPT2Config(PretrainedConfig):
|
||||
self.attn_pdrop = attn_pdrop
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
self.predict_special_tokens = predict_special_tokens
|
||||
|
||||
self.num_labels = num_labels
|
||||
self.summary_type = summary_type
|
||||
@@ -203,10 +195,6 @@ class GPT2Config(PretrainedConfig):
|
||||
"or the path to a pretrained model config file (str)"
|
||||
)
|
||||
|
||||
@property
|
||||
def total_tokens_embeddings(self):
|
||||
return self.vocab_size + self.n_special
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
return self.n_embd
|
||||
@@ -347,34 +335,6 @@ class Block(nn.Module):
|
||||
return outputs # x, present, (attentions)
|
||||
|
||||
|
||||
class GPT2LMHead(nn.Module):
|
||||
""" Language Model Head for the transformer """
|
||||
|
||||
def __init__(self, model_embeddings_weights, config):
|
||||
super(GPT2LMHead, self).__init__()
|
||||
self.n_embd = config.n_embd
|
||||
self.vocab_size = config.vocab_size
|
||||
self.predict_special_tokens = config.predict_special_tokens
|
||||
self.torchscript = config.torchscript
|
||||
embed_shape = model_embeddings_weights.shape
|
||||
self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
|
||||
self.set_embeddings_weights(model_embeddings_weights)
|
||||
|
||||
def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
|
||||
self.predict_special_tokens = predict_special_tokens
|
||||
# Export to TorchScript can't handle parameter sharing so we are cloning them.
|
||||
if self.torchscript:
|
||||
self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
|
||||
else:
|
||||
self.decoder.weight = model_embeddings_weights # Tied weights
|
||||
|
||||
def forward(self, hidden_state):
|
||||
lm_logits = self.decoder(hidden_state)
|
||||
if not self.predict_special_tokens:
|
||||
lm_logits = lm_logits[..., :self.vocab_size]
|
||||
return lm_logits
|
||||
|
||||
|
||||
class GPT2PreTrainedModel(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
@@ -400,36 +360,6 @@ class GPT2PreTrainedModel(PreTrainedModel):
|
||||
module.bias.data.zero_()
|
||||
module.weight.data.fill_(1.0)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|
||||
"""
|
||||
Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict.
|
||||
Download and cache the pre-trained model file if needed.
|
||||
|
||||
Params:
|
||||
pretrained_model_name_or_path: either:
|
||||
- a str with the name of a pre-trained model to load selected in the list of:
|
||||
. `gpt2`
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `gpt2_config.json` a configuration file for the model
|
||||
. `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `gpt2_config.json` a configuration file for the model
|
||||
. a TensorFlow checkpoint with trained weights
|
||||
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
|
||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
||||
state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
|
||||
*inputs, **kwargs: additional input for the specific GPT2 class
|
||||
"""
|
||||
num_special_tokens = kwargs.pop('num_special_tokens', None)
|
||||
|
||||
model = super().from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
|
||||
# Add additional embeddings for special tokens if needed
|
||||
# This step also make sure we are still sharing the output and input embeddings after loading weights
|
||||
model.set_num_special_tokens(num_special_tokens)
|
||||
return model
|
||||
|
||||
|
||||
class GPT2Model(GPT2PreTrainedModel):
|
||||
"""OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners").
|
||||
@@ -447,13 +377,13 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
config.vocab_size - 1, ______________________
|
||||
config.vocab_size,
|
||||
... -> special embeddings
|
||||
config.vocab_size + config.n_special - 1] ______________________
|
||||
config.vocab_size + n_special - 1] ______________________
|
||||
|
||||
where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is equal to
|
||||
where total_tokens_embeddings is equal to
|
||||
|
||||
::
|
||||
|
||||
total_tokens_embeddings = config.vocab_size + config.n_special
|
||||
total_tokens_embeddings = vocab_size + n_special
|
||||
|
||||
You should use the associated indices to index the embeddings.
|
||||
|
||||
@@ -474,7 +404,7 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
self.output_hidden_states = config.output_hidden_states
|
||||
self.output_attentions = config.output_attentions
|
||||
|
||||
self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
|
||||
self.wte = nn.Embedding(config.vocab_size, config.n_embd)
|
||||
self.wpe = nn.Embedding(config.n_positions, config.n_embd)
|
||||
self.drop = nn.Dropout(config.embd_pdrop)
|
||||
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
|
||||
@@ -482,26 +412,8 @@ class GPT2Model(GPT2PreTrainedModel):
|
||||
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def set_num_special_tokens(self, num_special_tokens=None):
|
||||
"""
|
||||
Update input embeddings with new embedding matrix if needed.
|
||||
|
||||
Args:
|
||||
num_special_tokens: Special tokens to be added to the embedding matrix
|
||||
|
||||
TODO Lysandre filled args
|
||||
"""
|
||||
if num_special_tokens is None or self.config.n_special == num_special_tokens:
|
||||
return
|
||||
# Update config
|
||||
self.config.n_special = num_special_tokens
|
||||
# Build new embeddings and initialize all new embeddings (in particular the special tokens)
|
||||
old_embed = self.wte
|
||||
self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
|
||||
self.wte.to(old_embed.weight.device)
|
||||
self.init_weights(self.wte)
|
||||
# Copy word embeddings from the previous weights
|
||||
self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
@@ -641,23 +553,20 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
|
||||
def __init__(self, config):
|
||||
super(GPT2LMHeadModel, self).__init__(config)
|
||||
self.transformer = GPT2Model(config)
|
||||
self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
|
||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.tie_weights()
|
||||
|
||||
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
|
||||
def tie_weights(self):
|
||||
""" Make sure we are sharing the input and output embeddings.
|
||||
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
|
||||
"""
|
||||
Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
|
||||
|
||||
Args:
|
||||
num_special_tokens: Special tokens to be added to the embedding matrix
|
||||
predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
|
||||
Defaults to True.
|
||||
|
||||
TODO Lysandre filled args
|
||||
"""
|
||||
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
|
||||
self.transformer.set_num_special_tokens(num_special_tokens)
|
||||
self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
|
||||
input_embeddings = self.transformer.wte.weight
|
||||
if self.config.torchscript:
|
||||
self.lm_head.weight = nn.Parameter(input_embeddings.clone())
|
||||
else:
|
||||
self.lm_head.weight = input_embeddings # Tied weights
|
||||
|
||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
|
||||
"""
|
||||
@@ -740,25 +649,20 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
|
||||
def __init__(self, config):
|
||||
super(GPT2DoubleHeadsModel, self).__init__(config)
|
||||
self.transformer = GPT2Model(config)
|
||||
self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
|
||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||
self.multiple_choice_head = SequenceSummary(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
|
||||
def tie_weights(self):
|
||||
""" Make sure we are sharing the input and output embeddings.
|
||||
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
|
||||
"""
|
||||
Update input and output embeddings with new embedding matrix.Make sure we are sharing the embeddings
|
||||
|
||||
Args:
|
||||
num_special_tokens: Special tokens to be added to the embedding matrix
|
||||
predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
|
||||
Defaults to True.
|
||||
|
||||
TODO Lysandre filled args
|
||||
"""
|
||||
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
|
||||
self.transformer.set_num_special_tokens(num_special_tokens)
|
||||
self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
|
||||
input_embeddings = self.transformer.wte.weight
|
||||
if self.config.torchscript:
|
||||
self.lm_head.weight = nn.Parameter(input_embeddings.clone())
|
||||
else:
|
||||
self.lm_head.weight = input_embeddings # Tied weights
|
||||
|
||||
def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
|
||||
position_ids=None, past=None, head_mask=None):
|
||||
|
||||
@@ -156,7 +156,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size_or_config_json_file=40478,
|
||||
n_special=0,
|
||||
n_positions=512,
|
||||
n_ctx=512,
|
||||
n_embd=768,
|
||||
@@ -190,7 +189,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
self.__dict__[key] = value
|
||||
elif isinstance(vocab_size_or_config_json_file, int):
|
||||
self.vocab_size = vocab_size_or_config_json_file
|
||||
self.n_special = n_special
|
||||
self.n_ctx = n_ctx
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
@@ -216,10 +214,6 @@ class OpenAIGPTConfig(PretrainedConfig):
|
||||
"or the path to a pretrained model config file (str)"
|
||||
)
|
||||
|
||||
@property
|
||||
def total_tokens_embeddings(self):
|
||||
return self.vocab_size + self.n_special
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
return self.n_embd
|
||||
@@ -355,34 +349,6 @@ class Block(nn.Module):
|
||||
return outputs
|
||||
|
||||
|
||||
class OpenAIGPTLMHead(nn.Module):
|
||||
""" Language Model Head for the transformer """
|
||||
|
||||
def __init__(self, model_embeddings_weights, config):
|
||||
super(OpenAIGPTLMHead, self).__init__()
|
||||
self.n_embd = config.n_embd
|
||||
self.vocab_size = config.vocab_size
|
||||
self.predict_special_tokens = config.predict_special_tokens
|
||||
self.torchscript = config.torchscript
|
||||
embed_shape = model_embeddings_weights.shape
|
||||
self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
|
||||
self.set_embeddings_weights(model_embeddings_weights)
|
||||
|
||||
def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
|
||||
self.predict_special_tokens = predict_special_tokens
|
||||
|
||||
if self.torchscript:
|
||||
self.decoder.weight = nn.Parameter(model_embeddings_weights.clone())
|
||||
else:
|
||||
self.decoder.weight = model_embeddings_weights # Tied weights
|
||||
|
||||
def forward(self, hidden_state):
|
||||
lm_logits = self.decoder(hidden_state)
|
||||
if not self.predict_special_tokens:
|
||||
lm_logits = lm_logits[..., :self.vocab_size]
|
||||
return lm_logits
|
||||
|
||||
|
||||
class OpenAIGPTPreTrainedModel(PreTrainedModel):
|
||||
""" An abstract class to handle weights initialization and
|
||||
a simple interface for downloading and loading pretrained models.
|
||||
@@ -408,36 +374,6 @@ class OpenAIGPTPreTrainedModel(PreTrainedModel):
|
||||
module.bias.data.zero_()
|
||||
module.weight.data.fill_(1.0)
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
|
||||
"""
|
||||
Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
|
||||
Download and cache the pre-trained model file if needed.
|
||||
|
||||
Params:
|
||||
pretrained_model_name_or_path: either:
|
||||
- a str with the name of a pre-trained model to load selected in the list of:
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `config.json` a configuration file for the model
|
||||
. `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
|
||||
- a path or url to a pretrained model archive containing:
|
||||
. `config.json` a configuration file for the model
|
||||
. a series of NumPy files containing OpenAI TensorFlow trained weights
|
||||
from_tf: should we load the weights from a locally saved TensorFlow checkpoint
|
||||
cache_dir: an optional path to a folder in which the pre-trained models will be cached.
|
||||
state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
|
||||
*inputs, **kwargs: additional input for the specific OpenAI-GPT class
|
||||
"""
|
||||
num_special_tokens = kwargs.get('num_special_tokens', None)
|
||||
kwargs.pop('num_special_tokens', None)
|
||||
|
||||
model = super(PreTrainedModel, cls).from_pretrained(pretrained_model_name_or_path, pretrained_model_name_or_path, *inputs, **kwargs)
|
||||
|
||||
# Add additional embeddings for special tokens if needed
|
||||
# This step also make sure we are still sharing the output and input embeddings after loading weights
|
||||
model.set_num_special_tokens(num_special_tokens)
|
||||
return model
|
||||
|
||||
|
||||
class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
||||
"""OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
|
||||
@@ -457,13 +393,13 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
||||
config.vocab_size - 1, ______________________
|
||||
config.vocab_size,
|
||||
... -> special embeddings
|
||||
config.vocab_size + config.n_special - 1] ______________________
|
||||
config.vocab_size + n_special - 1] ______________________
|
||||
|
||||
where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
|
||||
where ``total_tokens_embeddings`` is:
|
||||
|
||||
::
|
||||
|
||||
total_tokens_embeddings = config.vocab_size + config.n_special
|
||||
total_tokens_embeddings = config.vocab_size + n_special
|
||||
|
||||
You should use the associated indices to index the embeddings.
|
||||
|
||||
@@ -485,34 +421,15 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
|
||||
self.output_attentions = config.output_attentions
|
||||
self.output_hidden_states = config.output_hidden_states
|
||||
|
||||
self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
|
||||
self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)
|
||||
self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
|
||||
self.drop = nn.Dropout(config.embd_pdrop)
|
||||
self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
|
||||
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def set_num_special_tokens(self, num_special_tokens=None):
|
||||
"""
|
||||
Update input embeddings with new embedding matrice if needed
|
||||
|
||||
Args:
|
||||
num_special_tokens: Special tokens to be added to the embedding matrix
|
||||
|
||||
TODO Lysandre filled Args
|
||||
|
||||
"""
|
||||
if num_special_tokens is None or self.config.n_special == num_special_tokens:
|
||||
return
|
||||
# Update config
|
||||
self.config.n_special = num_special_tokens
|
||||
# Build new embeddings and initialize all new embeddings (in particular the special tokens)
|
||||
old_embed = self.tokens_embed
|
||||
self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
|
||||
self.tokens_embed.to(old_embed.weight.device)
|
||||
self.init_weights(self.tokens_embed)
|
||||
# Copy word embeddings from the previous weights
|
||||
self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
self.tokens_embed = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
@@ -657,24 +574,20 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
|
||||
def __init__(self, config):
|
||||
super(OpenAIGPTLMHeadModel, self).__init__(config)
|
||||
self.transformer = OpenAIGPTModel(config)
|
||||
self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
|
||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.tie_weights()
|
||||
|
||||
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
|
||||
def tie_weights(self):
|
||||
""" Make sure we are sharing the input and output embeddings.
|
||||
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
|
||||
"""
|
||||
Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings
|
||||
|
||||
Args:
|
||||
num_special_tokens: Special tokens to be added to the embedding matrix
|
||||
predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
|
||||
Defaults to True.
|
||||
|
||||
TODO Lysandre filled Args
|
||||
|
||||
"""
|
||||
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
|
||||
self.transformer.set_num_special_tokens(num_special_tokens)
|
||||
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
|
||||
input_embeddings = self.transformer.tokens_embed.weight
|
||||
if self.config.torchscript:
|
||||
self.lm_head.weight = nn.Parameter(input_embeddings.clone())
|
||||
else:
|
||||
self.lm_head.weight = input_embeddings # Tied weights
|
||||
|
||||
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
|
||||
"""
|
||||
@@ -747,13 +660,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
config.vocab_size - 1, ______________________
|
||||
config.vocab_size,
|
||||
... -> special embeddings
|
||||
config.vocab_size + config.n_special - 1] ______________________
|
||||
config.vocab_size + n_special - 1] ______________________
|
||||
|
||||
where ``total_tokens_embeddings`` can be obtained as ``config.total_tokens_embeddings`` and is:
|
||||
where ``total_tokens_embeddings`` is:
|
||||
|
||||
::
|
||||
|
||||
total_tokens_embeddings = config.vocab_size + config.n_special
|
||||
total_tokens_embeddings = config.vocab_size + n_special
|
||||
|
||||
You should use the associated indices to index the embeddings.
|
||||
|
||||
@@ -773,24 +686,21 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
|
||||
super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
|
||||
|
||||
self.transformer = OpenAIGPTModel(config)
|
||||
self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
|
||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||
self.multiple_choice_head = SequenceSummary(config)
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.tie_weights()
|
||||
|
||||
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
|
||||
""" Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
|
||||
|
||||
Args:
|
||||
num_special_tokens: Special tokens to be added to the embedding matrix
|
||||
predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
|
||||
Defaults to True.
|
||||
|
||||
TODO Lysandre filled Args
|
||||
def tie_weights(self):
|
||||
""" Make sure we are sharing the input and output embeddings.
|
||||
Export to TorchScript can't handle parameter sharing so we are cloning them instead.
|
||||
"""
|
||||
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
|
||||
self.transformer.set_num_special_tokens(num_special_tokens)
|
||||
self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
|
||||
input_embeddings = self.transformer.tokens_embed.weight
|
||||
if self.config.torchscript:
|
||||
self.lm_head.weight = nn.Parameter(input_embeddings.clone())
|
||||
else:
|
||||
self.lm_head.weight = input_embeddings # Tied weights
|
||||
|
||||
def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
|
||||
position_ids=None, head_mask=None):
|
||||
|
||||
@@ -287,6 +287,10 @@ class TransfoXLConfig(PretrainedConfig):
|
||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||
"or the path to a pretrained model config file (str)")
|
||||
|
||||
@property
|
||||
def vocab_size(self):
|
||||
return self.n_token
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
return self.d_model
|
||||
@@ -998,6 +1002,9 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
|
||||
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
raise NotImplementedError
|
||||
|
||||
def backward_compatible(self):
|
||||
self.sample_softmax = -1
|
||||
|
||||
|
||||
@@ -151,6 +151,7 @@ class PreTrainedModel(nn.Module):
|
||||
pretrained_model_archive_map = {}
|
||||
load_tf_weights = lambda model, config, path: None
|
||||
base_model_prefix = ""
|
||||
input_embeddings = None
|
||||
|
||||
def __init__(self, config, *inputs, **kwargs):
|
||||
super(PreTrainedModel, self).__init__()
|
||||
@@ -164,12 +165,48 @@ class PreTrainedModel(nn.Module):
|
||||
# Save config in model
|
||||
self.config = config
|
||||
|
||||
def _get_resized_embeddings(self, old_embeddings, new_num_tokens):
|
||||
# Build new embeddings
|
||||
old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
|
||||
new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
|
||||
new_embeddings.to(old_embeddings.weight.device)
|
||||
|
||||
# initialize all new embeddings (in particular added tokens)
|
||||
self.init_weights(new_embeddings)
|
||||
|
||||
# Copy word embeddings from the previous weights
|
||||
num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
|
||||
new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
|
||||
|
||||
return new_embeddings
|
||||
|
||||
def resize_token_embeddings(self, new_num_tokens):
|
||||
""" Resize input token embeddings matrix.
|
||||
|
||||
Args:
|
||||
new_num_tokens: New number of tokens in the embedding matrix.
|
||||
Increasing the size will add newly initialized vectors at the end
|
||||
Reducing the size will remove vectors from the end
|
||||
"""
|
||||
if new_num_tokens == self.config.vocab_size:
|
||||
return
|
||||
base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
|
||||
base_model._resize_token_embeddings(new_num_tokens)
|
||||
|
||||
# Update base model and current model config
|
||||
self.config.vocab_size = new_num_tokens
|
||||
base_model.vocab_size = new_num_tokens
|
||||
|
||||
# Tie weights again if needed
|
||||
if hasattr(self, 'tie_weights'):
|
||||
self.tie_weights()
|
||||
|
||||
def prune_heads(self, heads_to_prune):
    """ Prunes heads of the base model.

    Args:
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
    """
    # Resolve the base model once and prune exactly once; when this object is
    # the base model itself the attribute lookup falls back to ``self``.
    base_model = getattr(self, self.base_model_prefix, self)
    base_model._prune_heads(heads_to_prune)
|
||||
|
||||
def save_pretrained(self, save_directory):
|
||||
""" Save a model with its configuration file to a directory, so that it
|
||||
|
||||
@@ -104,7 +104,6 @@ class XLMConfig(PretrainedConfig):
|
||||
|
||||
def __init__(self,
|
||||
vocab_size_or_config_json_file=30145,
|
||||
n_special=0,
|
||||
emb_dim=2048,
|
||||
n_layers=12,
|
||||
n_heads=16,
|
||||
@@ -148,7 +147,6 @@ class XLMConfig(PretrainedConfig):
|
||||
self.__dict__[key] = value
|
||||
elif isinstance(vocab_size_or_config_json_file, int):
|
||||
self.n_words = vocab_size_or_config_json_file
|
||||
self.n_special = n_special
|
||||
self.emb_dim = emb_dim
|
||||
self.n_layers = n_layers
|
||||
self.n_heads = n_heads
|
||||
@@ -183,8 +181,8 @@ class XLMConfig(PretrainedConfig):
|
||||
"or the path to a pretrained model config file (str)")
|
||||
|
||||
@property
def total_tokens_embeddings(self):
    """Number of word tokens plus special tokens.

    ``n_special`` is treated as 0 when the attribute is absent, so the
    property keeps working on configs that do not define it.
    """
    return self.n_words + getattr(self, "n_special", 0)

@property
def vocab_size(self):
    """Vocabulary size, stored on the config as ``n_words``."""
    return self.n_words

@vocab_size.setter
def vocab_size(self, value):
    # Writable so that code assigning ``config.vocab_size`` (as the embedding
    # resizing code does) updates ``n_words`` instead of raising AttributeError.
    self.n_words = value
|
||||
|
||||
@property
|
||||
def hidden_size(self):
|
||||
@@ -479,6 +477,9 @@ class XLMModel(XLMPreTrainedModel):
|
||||
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
|
||||
""" Prunes heads of the model.
|
||||
heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
|
||||
@@ -718,8 +719,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
||||
"""
|
||||
def __init__(self, config):
|
||||
super(XLMWithLMHeadModel, self).__init__(config)
|
||||
self.torchscript = config.torchscript
|
||||
|
||||
self.transformer = XLMModel(config)
|
||||
self.pred_layer = XLMPredLayer(config)
|
||||
|
||||
@@ -729,7 +728,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
|
||||
def tie_weights(self):
    """ Make sure we are sharing the embeddings

    When ``config.torchscript`` is set the projection layer receives its own
    copy of the embedding matrix; otherwise both modules use the very same
    parameter.
    """
    embedding_weight = self.transformer.embeddings.weight
    # Read the flag from the config: the model no longer keeps a separate
    # ``torchscript`` attribute of its own.
    if self.config.torchscript:
        self.pred_layer.proj.weight = nn.Parameter(embedding_weight.clone())
    else:
        self.pred_layer.proj.weight = embedding_weight
|
||||
|
||||
@@ -312,6 +312,10 @@ class XLNetConfig(PretrainedConfig):
|
||||
raise ValueError("First argument must be either a vocabulary size (int)"
|
||||
"or the path to a pretrained model config file (str)")
|
||||
|
||||
@property
def vocab_size(self):
    """Vocabulary size, stored on the config as ``n_token``."""
    return self.n_token

@vocab_size.setter
def vocab_size(self, value):
    # Writable so that code assigning ``config.vocab_size`` (as the embedding
    # resizing code does) updates ``n_token`` instead of raising AttributeError.
    self.n_token = value

@property
def hidden_size(self):
    """Hidden size, stored on the config as ``d_model``."""
    return self.d_model
|
||||
@@ -654,6 +658,9 @@ class XLNetModel(XLNetPreTrainedModel):
|
||||
|
||||
self.apply(self.init_weights)
|
||||
|
||||
def _resize_token_embeddings(self, new_num_tokens):
|
||||
self.word_embedding = self._get_resized_embeddings(self.word_embedding, new_num_tokens)
|
||||
|
||||
def _prune_heads(self, heads_to_prune):
    """ Head pruning is not implemented here: log a notice and change nothing.

    Args:
        heads_to_prune: ignored.
    """
    logger.info("Head pruning is not implemented for XLNet")
|
||||
@@ -970,20 +977,17 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
|
||||
super(XLNetLMHeadModel, self).__init__(config)
|
||||
self.attn_type = config.attn_type
|
||||
self.same_length = config.same_length
|
||||
self.torchscript = config.torchscript
|
||||
|
||||
self.transformer = XLNetModel(config)
|
||||
self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
|
||||
|
||||
# Tie weights
|
||||
|
||||
self.apply(self.init_weights)
|
||||
self.tie_weights()
|
||||
|
||||
def tie_weights(self):
    """ Make sure we are sharing the embeddings

    When ``config.torchscript`` is set the output layer receives its own copy
    of the word embedding matrix; otherwise both modules use the very same
    parameter.
    """
    embedding_weight = self.transformer.word_embedding.weight
    # Read the flag from the config: the model no longer keeps a separate
    # ``torchscript`` attribute of its own.
    if self.config.torchscript:
        self.lm_loss.weight = nn.Parameter(embedding_weight.clone())
    else:
        self.lm_loss.weight = embedding_weight
|
||||
|
||||
@@ -26,7 +26,7 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
|
||||
BertForTokenClassification, BertForMultipleChoice)
|
||||
from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
|
||||
from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
|
||||
from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
|
||||
|
||||
|
||||
class BertModelTest(unittest.TestCase):
|
||||
|
||||
@@ -22,8 +22,15 @@ import shutil
|
||||
import json
|
||||
import random
|
||||
|
||||
import unittest
|
||||
import logging
|
||||
|
||||
import torch
|
||||
|
||||
from pytorch_transformers import PretrainedConfig, PreTrainedModel
|
||||
from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
|
||||
|
||||
def _config_zero_init(config):
|
||||
configs_no_init = copy.deepcopy(config)
|
||||
for key in configs_no_init.__dict__.keys():
|
||||
@@ -242,6 +249,7 @@ class ConfigTester(object):
|
||||
|
||||
def create_and_test_config_common_properties(self):
|
||||
config = self.config_class(**self.inputs_dict)
|
||||
self.parent.assertTrue(hasattr(config, 'vocab_size'))
|
||||
self.parent.assertTrue(hasattr(config, 'hidden_size'))
|
||||
self.parent.assertTrue(hasattr(config, 'num_attention_heads'))
|
||||
self.parent.assertTrue(hasattr(config, 'num_hidden_layers'))
|
||||
@@ -276,7 +284,6 @@ class GPTModelTester(object):
|
||||
use_token_type_ids=True,
|
||||
use_labels=True,
|
||||
vocab_size=99,
|
||||
n_special=1,
|
||||
n_positions=33,
|
||||
hidden_size=32,
|
||||
num_hidden_layers=5,
|
||||
@@ -299,7 +306,6 @@ class GPTModelTester(object):
|
||||
self.use_token_type_ids = use_token_type_ids
|
||||
self.use_labels = use_labels
|
||||
self.vocab_size = vocab_size
|
||||
self.n_special = n_special
|
||||
self.n_positions = n_positions
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
@@ -316,7 +322,7 @@ class GPTModelTester(object):
|
||||
self.all_model_classes = (base_model_class, lm_head_model_class, double_head_model_class)
|
||||
|
||||
def prepare_config_and_inputs(self):
|
||||
total_num_tokens = self.vocab_size + self.n_special
|
||||
total_num_tokens = self.vocab_size
|
||||
input_ids = ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_num_tokens)
|
||||
|
||||
position_ids = None
|
||||
@@ -338,7 +344,6 @@ class GPTModelTester(object):
|
||||
|
||||
config = self.config_class(
|
||||
vocab_size_or_config_json_file=self.vocab_size,
|
||||
n_special=self.n_special,
|
||||
n_positions=self.n_positions,
|
||||
n_embd=self.hidden_size,
|
||||
n_layer=self.num_hidden_layers,
|
||||
@@ -370,7 +375,7 @@ class GPTModelTester(object):
|
||||
outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
|
||||
loss, lm_logits = outputs[:2]
|
||||
|
||||
total_voc = self.n_special + self.vocab_size
|
||||
total_voc = self.vocab_size
|
||||
self.parent.assertListEqual(
|
||||
list(lm_logits.size()),
|
||||
[self.batch_size, self.n_choices, self.seq_length, total_voc])
|
||||
@@ -400,7 +405,7 @@ class GPTModelTester(object):
|
||||
lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
|
||||
loss = [lm_loss, mc_loss]
|
||||
|
||||
total_voc = self.n_special + self.vocab_size
|
||||
total_voc = self.vocab_size
|
||||
self.parent.assertListEqual(
|
||||
list(lm_logits.size()),
|
||||
[self.batch_size, self.n_choices, self.seq_length, total_voc])
|
||||
@@ -441,6 +446,30 @@ class GPTModelTester(object):
|
||||
self.create_and_check_commons(*config_and_inputs)
|
||||
|
||||
def run_slow_tests(self):
|
||||
config_and_inputs = self.prepare_config_and_inputs()
|
||||
self.create_and_check_model_from_pretrained(*config_and_inputs)
|
||||
self.create_and_check_model_from_pretrained()
|
||||
|
||||
|
||||
class ModelUtilsTest(unittest.TestCase):
    """Checks ``from_pretrained`` for a config and a model on the first BERT archive entry."""

    def test_model_from_pretrained(self):
        """Load a pretrained config and model, with and without extra options."""
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            config = BertConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, PretrainedConfig)

            # Plain call: verify its result before the name is rebound below,
            # otherwise this load is never actually checked.
            model = BertModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, PreTrainedModel)

            # Same load, additionally returning the loading report.
            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, PreTrainedModel)
            # Every entry of the report is expected to be empty.
            for value in loading_info.values():
                self.assertEqual(len(value), 0)

            # Extra keyword arguments must end up in the model's config.
            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
            self.assertEqual(model.config.output_attentions, True)
            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(model.config, config)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -28,7 +28,7 @@ import torch
|
||||
from pytorch_transformers import (GPT2Config, GPT2Model,
|
||||
GPT2LMHeadModel, GPT2DoubleHeadsModel)
|
||||
|
||||
from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
|
||||
from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
|
||||
|
||||
class GPT2ModelTest(unittest.TestCase):
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ import torch
|
||||
from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
|
||||
OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
|
||||
|
||||
from .modeling_tests_commons import (create_and_check_commons, ConfigTester, GPTModelTester)
|
||||
from .modeling_common_test import (create_and_check_commons, ConfigTester, GPTModelTester)
|
||||
|
||||
class OpenAIModelTest(unittest.TestCase):
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ import torch
|
||||
from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
|
||||
from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
|
||||
from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
|
||||
from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
|
||||
|
||||
class TransfoXLModelTest(unittest.TestCase):
|
||||
class TransfoXLModelTester(object):
|
||||
|
||||
@@ -1,47 +0,0 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 HuggingFace Inc..
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import logging
|
||||
|
||||
from pytorch_transformers import PretrainedConfig, PreTrainedModel
|
||||
from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
|
||||
class ModelUtilsTest(unittest.TestCase):
    """Checks ``from_pretrained`` for a config and a model on the first BERT archive entry."""

    def test_model_from_pretrained(self):
        """Load a pretrained config and model, with and without extra options."""
        logging.basicConfig(level=logging.INFO)
        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            config = BertConfig.from_pretrained(model_name)
            self.assertIsNotNone(config)
            self.assertIsInstance(config, PretrainedConfig)

            # Plain call: verify its result before the name is rebound below,
            # otherwise this load is never actually checked.
            model = BertModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, PreTrainedModel)

            # Same load, additionally returning the loading report.
            model, loading_info = BertModel.from_pretrained(model_name, output_loading_info=True)
            self.assertIsNotNone(model)
            self.assertIsInstance(model, PreTrainedModel)
            # Every entry of the report is expected to be empty.
            for value in loading_info.values():
                self.assertEqual(len(value), 0)

            # Extra keyword arguments must end up in the model's config.
            config = BertConfig.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
            model = BertModel.from_pretrained(model_name, output_attentions=True, output_hidden_states=True)
            self.assertEqual(model.config.output_attentions, True)
            self.assertEqual(model.config.output_hidden_states, True)
            self.assertEqual(model.config, config)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -23,7 +23,7 @@ import pytest
|
||||
from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
|
||||
from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
|
||||
from .modeling_tests_commons import (create_and_check_commons, ConfigTester, ids_tensor)
|
||||
from .modeling_common_test import (create_and_check_commons, ConfigTester, ids_tensor)
|
||||
|
||||
|
||||
class XLMModelTest(unittest.TestCase):
|
||||
|
||||
@@ -28,7 +28,7 @@ import torch
|
||||
from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
|
||||
from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
|
||||
|
||||
from .modeling_tests_commons import ConfigTester, create_and_check_commons, ids_tensor
|
||||
from .modeling_common_test import ConfigTester, create_and_check_commons, ids_tensor
|
||||
|
||||
class XLNetModelTest(unittest.TestCase):
|
||||
class XLNetModelTester(object):
|
||||
|
||||
Reference in New Issue
Block a user