From bd91ae654faf8bc45eb68b13668d2013df4ffb9c Mon Sep 17 00:00:00 2001 From: lukovnikov Date: Tue, 6 Nov 2018 18:21:44 +0100 Subject: [PATCH] moved bert to qelos-util --- hf_bert/__init__.py | 0 modeling.py | 11 +++++-- tests/mytest.py | 71 --------------------------------------------- 3 files changed, 8 insertions(+), 74 deletions(-) create mode 100644 hf_bert/__init__.py delete mode 100644 tests/mytest.py diff --git a/hf_bert/__init__.py b/hf_bert/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/modeling.py b/modeling.py index 4cbb99f2fa..dd43c9c46a 100644 --- a/modeling.py +++ b/modeling.py @@ -34,6 +34,10 @@ def gelu(x): return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) +def swish(x): + return x * torch.sigmoid(x) + + class BertConfig(object): """Configuration class to store the configuration of a `BertModel`. """ @@ -60,7 +64,7 @@ class BertConfig(object): intermediate_size: The size of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the - encoder and pooler. + encoder and pooler. If string, "gelu", "relu" and "swish" supported. hidden_dropout_prob: The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler. 
attention_probs_dropout_prob: The dropout ratio for the attention @@ -237,7 +241,8 @@ class BERTIntermediate(nn.Module): def __init__(self, config): super(BERTIntermediate, self).__init__() self.dense = nn.Linear(config.hidden_size, config.intermediate_size) - self.intermediate_act_fn = gelu + act2fn = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish} + self.intermediate_act_fn = act2fn[config.hidden_act] if isinstance(config.hidden_act, str) else config.hidden_act def forward(self, hidden_states): hidden_states = self.dense(hidden_states) @@ -355,7 +360,7 @@ class BertModel(nn.Module): all_encoder_layers = self.encoder(embedding_output, extended_attention_mask) sequence_output = all_encoder_layers[-1] pooled_output = self.pooler(sequence_output) - return [embedding_output] + all_encoder_layers, pooled_output + return all_encoder_layers, pooled_output class BertForSequenceClassification(nn.Module): """BERT model for classification. diff --git a/tests/mytest.py b/tests/mytest.py deleted file mode 100644 index 2b2dadecda..0000000000 --- a/tests/mytest.py +++ /dev/null @@ -1,71 +0,0 @@ -import unittest -import json -import random - -import torch -import numpy as np - -import modeling -import convert_tf_checkpoint_to_pytorch - -import grouch - - -class MyTest(unittest.TestCase): - def test_loading_and_running(self): - bertpath = "../../grouch/data/bert/bert-base/" - configpath = bertpath + "bert_config.json" - ckptpath = bertpath + "bert_model.ckpt" - m = convert_tf_checkpoint_to_pytorch.convert(configpath, ckptpath) - m.eval() - # print(m) - - input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]]) - input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]]) - token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]]) - - all_y, pool_y = m(input_ids, token_type_ids, input_mask) - print(pool_y.shape) - # np.save("_bert_ref_pool_out.npy", pool_y.detach().numpy()) - # np.save("_bert_ref_all_out.npy", torch.stack(all_y, 0).detach().numpy()) - - config = 
grouch.TransformerBERT.load_config(configpath) - gm = grouch.TransformerBERT.init_from_config(config) - gm.load_weights_from_tf_checkpoint(ckptpath) - gm.eval() - - g_all_y, g_pool_y = gm(input_ids, token_type_ids, input_mask) - print(g_pool_y.shape) - - # check embeddings - # print(m.embeddings) - # print(gm.emb) - # hugging_emb = m.embeddings(input_ids, token_type_ids) - # grouch_emb = gm.emb(input_ids, token_type_ids) - - print((all_y[0] - g_all_y[0]).norm()) - # print(all_y[0][:, :, :10] - g_all_y[0][:, :, :10]) - self.assertTrue(np.allclose(all_y[0].detach().numpy(), g_all_y[0].detach().numpy(), atol=1e-7)) - print("embeddings good") - - print(m.encoder.layer[0]) - print(gm.encoder.layers[0]) - print("norm of diff at layer 1", (all_y[1] - g_all_y[1]).norm()) - # print(all_y[1][:, :, :10] - g_all_y[1][:, :, :10]) - self.assertTrue(np.allclose(all_y[1].detach().numpy(), g_all_y[1].detach().numpy(), atol=1e-6)) - - # hugging_layer = m.encoder.layer[0] - # grouch_layer = gm.encoder.layers[0] - # print("comparing weights") - # print((hugging_layer.attention.self.query.weight - grouch_layer.slf_attn.q_proj.weight).norm()) - # print((hugging_layer.attention.self.query.bias - grouch_layer.slf_attn.q_proj.bias).norm()) - # print((hugging_layer.attention.self.key.weight - grouch_layer.slf_attn.k_proj.weight).norm()) - # print((hugging_layer.attention.self.key.bias - grouch_layer.slf_attn.k_proj.bias).norm()) - # print((hugging_layer.attention.self.value.weight - grouch_layer.slf_attn.v_proj.weight).norm()) - # print((hugging_layer.attention.self.value.bias - grouch_layer.slf_attn.v_proj.bias).norm()) - # print((hugging_layer.attention.output.dense.weight - grouch_layer.slf_attn.vw_proj.weight).norm()) - # print((hugging_layer.attention.output.dense.bias - grouch_layer.slf_attn.vw_proj.bias).norm()) - - print("norm of diff at last layer", (all_y[-1] - g_all_y[-1]).norm()) - # print(all_y[-1][:, :, :10] - g_all_y[-1][:, :, :10]) - 
self.assertTrue(np.allclose(all_y[-1].detach().numpy(), g_all_y[-1].detach().numpy(), atol=1e-4)) \ No newline at end of file