From 5456d82311d0f0896741709df72e9ba9434f6082 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 29 Jan 2019 09:54:18 +0100 Subject: [PATCH] more versatile model loading --- pytorch_pretrained_bert/modeling.py | 4 ++- pytorch_pretrained_bert/modeling_openai.py | 42 ++++++++++++---------- 2 files changed, 27 insertions(+), 19 deletions(-) diff --git a/pytorch_pretrained_bert/modeling.py b/pytorch_pretrained_bert/modeling.py index dc14eadd82..8d71b8e955 100644 --- a/pytorch_pretrained_bert/modeling.py +++ b/pytorch_pretrained_bert/modeling.py @@ -606,7 +606,9 @@ class BertPreTrainedModel(nn.Module): for name, child in module._modules.items(): if child is not None: load(child, prefix + name + '.') - start_prefix = 'bert.' if not hasattr(model, 'bert') and any(s.startwith('bert.') for s in state_dict.keys()) else '' + start_prefix = '' + if not hasattr(model, 'bert') and any(s.startswith('bert.') for s in state_dict.keys()): + start_prefix = 'bert.' load(model, prefix=start_prefix) if len(missing_keys) > 0: logger.info("Weights of {} not initialized from pretrained model: {}".format( diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py index 88e5690e9b..e71a3910f8 100644 --- a/pytorch_pretrained_bert/modeling_openai.py +++ b/pytorch_pretrained_bert/modeling_openai.py @@ -120,6 +120,7 @@ class OpenAIGPTConfig(object): self, vocab_size_or_config_json_file=40478, n_special=0, + n_positions=512, n_ctx=512, n_embd=768, n_layer=12, @@ -135,7 +136,8 @@ class OpenAIGPTConfig(object): Args: vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...) - n_ctx: Number of positional embeddings. + n_positions: Number of positional embeddings. + n_ctx: Size of the causal mask (usually same as n_positions). n_embd: Dimensionality of the embeddings and hidden states. n_layer: Number of hidden layers in the Transformer encoder. n_head: Number of attention heads for each attention layer in @@ -159,6 +161,7 @@ class OpenAIGPTConfig(object): self.vocab_size = vocab_size_or_config_json_file self.n_special = n_special self.n_ctx = n_ctx + self.n_positions = n_positions self.n_embd = n_embd self.n_layer = n_layer self.n_head = n_head @@ -175,7 +178,7 @@ class OpenAIGPTConfig(object): @property def total_num_embeddings(self): - return self.vocab_size + self.n_special + self.n_ctx + return self.vocab_size + self.n_special + self.n_positions @classmethod def from_dict(cls, json_object): @@ -234,7 +237,7 @@ class Attention(nn.Module): n_state = nx # in Attention: n_state=768 (nx=n_embd) # [switch nx => n_state from Block to Attention to keep identical to TF implem] assert n_state % config.n_head == 0 - self.register_buffer("b", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)) + self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)) self.n_head = config.n_head self.split_size = n_state self.scale = scale @@ -247,9 +250,9 @@ class Attention(nn.Module): w = torch.matmul(q, k) if self.scale: w = w / math.sqrt(v.size(-1)) - # w = w * self.b + -1e9 * (1 - self.b) # TF implem method: mask_attn_weights + # w = w * self.bias + -1e9 * (1 - self.bias) # TF implem method: mask_attn_weights # XD: self.b may be larger than w, so we need to crop it - b = self.b[:, :, : w.size(-2), : w.size(-1)] + b = self.bias[:, :, : w.size(-2), : w.size(-1)] w = w * b + -1e9 * (1 - b) w = nn.Softmax(dim=-1)(w) @@ -474,10 +477,12 @@ class OpenAIGPTPreTrainedModel(nn.Module): new_keys = [] for key in state_dict.keys(): new_key = None - if "gamma" in key: - new_key = key.replace("gamma", "weight") - if "beta" in key: - new_key = key.replace("beta", "bias") + if key.endswith(".g"): + new_key = key[:-2] + ".weight" + elif key.endswith(".b"): + new_key = key[:-2] + ".bias" + elif key.endswith(".w"): + new_key = key[:-2] + ".weight" if new_key: old_keys.append(key) new_keys.append(new_key) @@ -502,7 +507,8 @@ class OpenAIGPTPreTrainedModel(nn.Module): if child is not None: load(child, prefix + name + ".") - if hasattr(model, "transformer") and all(not s.startwith('transformer.') for s in state_dict.keys()): + start_model = model + if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()): start_model = model.transformer load(start_model, prefix="") @@ -541,7 +547,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): total_num_embeddings - 1] ______________________ where total_num_embeddings can be obtained as config.total_num_embeddings and is: - total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx + total_num_embeddings = config.vocab_size + config.n_special + config.n_positions You should use the associate indices to index the embeddings. The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them. @@ -554,7 +560,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[ `position_ids`: an optional torch.LongTensor with the same shape as input_ids - with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_ctx - 1[. + with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_positions - 1[. `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids You can use it to add a third embedding (the previous two being the word and position embeddings) to each token in the sentence. @@ -578,7 +584,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): def __init__(self, config): super(OpenAIGPTModel, self).__init__(config) - total_embeddings_size = config.vocab_size + config.n_special + config.n_ctx + total_embeddings_size = config.vocab_size + config.n_special + config.n_positions self.embed = nn.Embedding(total_embeddings_size, config.n_embd) self.drop = nn.Dropout(config.embd_pdrop) block = Block(config.n_ctx, config, scale=True) @@ -598,7 +604,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): self.init_weights(self.embed) # Copy word and positional embeddings from the previous weights self.embed.weight.data[: self.config.vocab_size, :] = old_embed.weight.data[: self.config.vocab_size, :] - self.embed.weight.data[-self.config.n_ctx :, :] = old_embed.weight.data[-self.config.n_ctx :, :] + self.embed.weight.data[-self.config.n_positions :, :] = old_embed.weight.data[-self.config.n_positions :, :] def forward(self, input_ids, position_ids=None, token_type_ids=None): if position_ids is None: @@ -645,7 +651,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): total_num_embeddings - 1] ______________________ where total_num_embeddings can be obtained as config.total_num_embeddings and is: - total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx + total_num_embeddings = config.vocab_size + config.n_special + config.n_positions You should use these indices to index the word, special and position embeddings. The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them. @@ -658,7 +664,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length] were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[ `position_ids`: an optional torch.LongTensor with the same shape as input_ids - with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_ctx - 1[. + with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_positions - 1[. `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids You can use it to add a third embedding (the previous two being the word and position embeddings) to each token in the sentence. @@ -725,7 +731,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): total_num_embeddings - 1] ______________________ where total_num_embeddings can be obtained as config.total_num_embeddings and is: - total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx + total_num_embeddings = config.vocab_size + config.n_special + config.n_positions You should use these indices to index the word, special and position embeddings. The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them. @@ -741,7 +747,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): with a value of 1 were the last hidden state is (usually the [CLS] token) and 0 otherwise. `position_ids`: an optional torch.LongTensor with the same shape as input_ids with the position indices (selected in the range [config.vocab_size + config.n_special, - config.vocab_size + config.n_special + config.n_ctx - 1[. + config.vocab_size + config.n_special + config.n_positions - 1[. `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids You can use it to add a third embedding (the previous two being the word and position embeddings) to each token in the sentence.