From 8aa22af0c51ffc837453eb6f4302e7b3cca0a9ab Mon Sep 17 00:00:00 2001 From: thomwolf Date: Sat, 3 Nov 2018 03:11:13 +0100 Subject: [PATCH] fixing model --- Comparing TF and PT models.ipynb | 383 +++++++++++++++++++------------ extract_features_pytorch.py | 20 +- modeling_pytorch.py | 49 ++-- 3 files changed, 278 insertions(+), 174 deletions(-) diff --git a/Comparing TF and PT models.ipynb b/Comparing TF and PT models.ipynb index e042bfc290..2f18d3b13c 100644 --- a/Comparing TF and PT models.ipynb +++ b/Comparing TF and PT models.ipynb @@ -12,8 +12,8 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2018-11-02T14:09:09.239405Z", - "start_time": "2018-11-02T14:09:08.126668Z" + "end_time": "2018-11-03T02:09:37.498678Z", + "start_time": "2018-11-03T02:09:36.366672Z" } }, "outputs": [], @@ -26,8 +26,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2018-11-02T14:09:09.370511Z", - "start_time": "2018-11-02T14:09:09.242527Z" + "end_time": "2018-11-03T02:09:37.621865Z", + "start_time": "2018-11-03T02:09:37.500988Z" } }, "outputs": [ @@ -52,7 +52,7 @@ "max_seq_length=128\n", "input_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/pytorch-pretrained-BERT/input.txt\"\n", "\n", - "layer_indexes = [-1]\n", + "layer_indexes = list(range(12))\n", "bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n", "tokenizer = tokenization.FullTokenizer(\n", " vocab_file=vocab_file, do_lower_case=True)\n", @@ -70,8 +70,8 @@ "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2018-11-02T14:09:12.514617Z", - "start_time": "2018-11-02T14:09:09.372137Z" + "end_time": "2018-11-03T02:09:40.831618Z", + "start_time": "2018-11-03T02:09:37.624063Z" } }, "outputs": [ @@ -79,15 +79,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "WARNING:tensorflow:Estimator's model_fn (.model_fn at 0x12b266ae8>) includes params argument, but params are not passed to Estimator.\n", - "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphrjfnoqh\n", - "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphrjfnoqh', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n", + "WARNING:tensorflow:Estimator's model_fn (.model_fn at 0x12b0bcc80>) includes params argument, but params are not passed to Estimator.\n", + "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u\n", + "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n", "graph_options {\n", " rewrite_options {\n", " meta_optimizer_iterations: ONE\n", " }\n", "}\n", - ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': , '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n", + ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': , '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n", "WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n", "INFO:tensorflow:_TPUContext: eval_on_tpu True\n", "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n" @@ -126,8 +126,8 @@ "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2018-11-02T14:09:17.745970Z", - "start_time": "2018-11-02T14:09:12.516953Z" + "end_time": "2018-11-03T02:09:46.413197Z", + "start_time": "2018-11-03T02:09:40.834621Z" } }, "outputs": [ @@ -135,42 +135,53 @@ "name": "stdout", "output_type": "stream", "text": [ - "INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphrjfnoqh, running initialization to predict.\n", + "INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u, running initialization to predict.\n", "INFO:tensorflow:Calling model_fn.\n", "INFO:tensorflow:Running infer on CPU\n", "INFO:tensorflow:Done calling model_fn.\n", "INFO:tensorflow:Graph was finalized.\n", "INFO:tensorflow:Running local_init_op.\n", "INFO:tensorflow:Done running local_init_op.\n", + "extracting layer 0\n", + "extracting layer 1\n", + "extracting layer 2\n", + "extracting layer 3\n", + "extracting layer 4\n", + "extracting layer 5\n", + "extracting layer 6\n", + "extracting layer 7\n", + "extracting layer 8\n", + "extracting layer 9\n", + "extracting layer 10\n", + "extracting layer 11\n", "INFO:tensorflow:prediction_loop marked as finished\n", "INFO:tensorflow:prediction_loop marked as finished\n" ] } ], "source": [ - "all_out = []\n", + "tensorflow_all_out = []\n", "for result in estimator.predict(input_fn, yield_single_examples=True):\n", " unique_id = int(result[\"unique_id\"])\n", " feature = unique_id_to_feature[unique_id]\n", " output_json = collections.OrderedDict()\n", " output_json[\"linex_index\"] = unique_id\n", - " all_out_features = []\n", - " for (i, token) in enumerate(feature.tokens):\n", - " all_layers = []\n", - " for (j, layer_index) in enumerate(layer_indexes):\n", - " layer_output = result[\"layer_output_%d\" % j]\n", - " layers = collections.OrderedDict()\n", - " layers[\"index\"] = layer_index\n", - " layers[\"values\"] = [\n", - " round(float(x), 6) for x in layer_output[i:(i + 1)].flat\n", - " ]\n", - " all_layers.append(layers)\n", - " out_features = collections.OrderedDict()\n", - " out_features[\"token\"] = token\n", - " out_features[\"layers\"] = all_layers\n", - " all_out_features.append(out_features)\n", - " output_json[\"features\"] = all_out_features\n", - " all_out.append(output_json)" + " tensorflow_all_out_features = []\n", + " # for (i, token) in enumerate(feature.tokens):\n", + " all_layers = []\n", + " for (j, layer_index) in enumerate(layer_indexes):\n", + " print(\"extracting layer {}\".format(j))\n", + " layer_output = result[\"layer_output_%d\" % j]\n", + " layers = collections.OrderedDict()\n", + " layers[\"index\"] = layer_index\n", + " layers[\"values\"] = layer_output\n", + " all_layers.append(layers)\n", + " tensorflow_out_features = collections.OrderedDict()\n", + " tensorflow_out_features[\"layers\"] = all_layers\n", + " tensorflow_all_out_features.append(tensorflow_out_features)\n", + "\n", + " output_json[\"features\"] = tensorflow_all_out_features\n", + " tensorflow_all_out.append(output_json)" ] }, { @@ -178,8 +189,8 @@ "execution_count": 5, "metadata": { "ExecuteTime": { - "end_time": "2018-11-02T14:09:17.780532Z", - "start_time": "2018-11-02T14:09:17.748778Z" + "end_time": "2018-11-03T02:09:46.460128Z", + "start_time": "2018-11-03T02:09:46.416138Z" } }, "outputs": [ @@ -190,15 +201,28 @@ "1\n", "2\n", "odict_keys(['linex_index', 'features'])\n", - "14\n" + "number of tokens 1\n", + "number of layers 12\n" ] + }, + { + "data": { + "text/plain": [ + "(128, 768)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "print(len(all_out))\n", - "print(len(all_out[0]))\n", - "print(all_out[0].keys())\n", - "print(len(all_out[0]['features']))" + "print(len(tensorflow_all_out))\n", + "print(len(tensorflow_all_out[0]))\n", + "print(tensorflow_all_out[0].keys())\n", + "print(\"number of tokens\", len(tensorflow_all_out[0]['features']))\n", + "print(\"number of layers\", len(tensorflow_all_out[0]['features'][0]['layers']))\n", + "tensorflow_all_out[0]['features'][0]['layers'][0]['values'].shape" ] }, { @@ -206,34 +230,13 @@ "execution_count": 6, "metadata": { "ExecuteTime": { - "end_time": "2018-11-02T14:09:17.818968Z", - "start_time": "2018-11-02T14:09:17.782121Z" + "end_time": "2018-11-03T02:09:46.498637Z", + "start_time": "2018-11-03T02:09:46.463115Z" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "[-0.628111,\n", - " 0.193215,\n", - " -0.75185,\n", - " -0.040464,\n", - " -0.875331,\n", - " 0.15654,\n", - " 1.385444,\n", - " 1.066997,\n", - " -0.349549,\n", - " 0.270686]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "tensorflow_output = all_out[0]['features'][0]['layers'][0]['values']\n", - "tensorflow_output[:10]" + "tensorflow_outputs = list(tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)" ] }, { @@ -248,12 +251,13 @@ "execution_count": 7, "metadata": { "ExecuteTime": { - "end_time": "2018-11-02T14:09:17.954196Z", - "start_time": "2018-11-02T14:09:17.821115Z" + "end_time": "2018-11-03T02:09:46.660303Z", + "start_time": "2018-11-03T02:09:46.501325Z" } }, "outputs": [], "source": [ + "import extract_features_pytorch\n", "from extract_features_pytorch import *" ] }, @@ -262,8 +266,8 @@ "execution_count": 8, "metadata": { "ExecuteTime": { - "end_time": "2018-11-02T14:09:19.196475Z", - "start_time": "2018-11-02T14:09:17.956199Z" + "end_time": "2018-11-03T02:09:48.292135Z", + "start_time": "2018-11-03T02:09:46.661921Z" } }, "outputs": [ @@ -574,7 +578,7 @@ "init_checkpoint_pt=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/pytorch_model.bin\"\n", "\n", "device = torch.device(\"cpu\")\n", - "model = BertModel(bert_config)\n", + "model = extract_features_pytorch.BertModel(bert_config)\n", "model.load_state_dict(torch.load(init_checkpoint_pt, map_location='cpu'))\n", "model.to(device)" ] @@ -584,8 +588,8 @@ "execution_count": 9, "metadata": { "ExecuteTime": { - "end_time": "2018-11-02T14:09:19.236256Z", - "start_time": "2018-11-02T14:09:19.198407Z" + "end_time": "2018-11-03T02:09:48.332982Z", + "start_time": "2018-11-03T02:09:48.294056Z" }, "code_folding": [] }, @@ -896,9 +900,10 @@ "source": [ "all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n", "all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n", + "all_input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)\n", "all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n", "\n", - "eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)\n", + "eval_data = TensorDataset(all_input_ids, all_input_mask, all_input_type_ids, all_example_index)\n", "eval_sampler = SequentialSampler(eval_data)\n", "eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\n", "\n", @@ -907,41 +912,86 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 16, "metadata": { "ExecuteTime": { - "end_time": "2018-11-02T14:09:19.671994Z", - "start_time": "2018-11-02T14:09:19.239454Z" + "end_time": "2018-11-03T02:09:54.371188Z", + "start_time": "2018-11-03T02:09:53.976875Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[ 101, 2040, 2001, 3958, 27227, 1029, 102, 3958, 27227, 2001,\n", + " 1037, 13997, 11510, 102, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0]])\n", + "tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", + " 0, 0, 0, 0, 0, 0, 0, 0]])\n", + "tensor([0])\n", + "layer 0 0\n", + "layer 1 1\n", + "layer 2 2\n", + "layer 3 3\n", + "layer 4 4\n", + "layer 5 5\n", + "layer 6 6\n", + "layer 7 7\n", + "layer 8 8\n", + "layer 9 9\n", + "layer 10 10\n", + "layer 11 11\n" + ] + } + ], "source": [ + "layer_indexes = list(range(12))\n", + "\n", "pytorch_all_out = []\n", - "for input_ids, input_mask, example_indices in eval_dataloader:\n", + "for input_ids, input_mask, input_type_ids, example_indices in eval_dataloader:\n", + " print(input_ids)\n", + " print(input_mask)\n", + " print(example_indices)\n", " input_ids = input_ids.to(device)\n", " input_mask = input_mask.float().to(device)\n", "\n", - " all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)\n", + " all_encoder_layers, _ = model(input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)\n", "\n", - " for enc_layers, example_index in zip(all_encoder_layers, example_indices):\n", + " for b, example_index in enumerate(example_indices):\n", " feature = features[example_index.item()]\n", " unique_id = int(feature.unique_id)\n", " # feature = unique_id_to_feature[unique_id]\n", " output_json = collections.OrderedDict()\n", " output_json[\"linex_index\"] = unique_id\n", " all_out_features = []\n", - " for (i, token) in enumerate(feature.tokens):\n", - " all_layers = []\n", - " for (j, layer_index) in enumerate(layer_indexes):\n", - " layer_output = enc_layers[int(layer_index)].detach().cpu().numpy()\n", - " layers = collections.OrderedDict()\n", - " layers[\"index\"] = layer_index\n", - " layers[\"values\"] = [\n", - " round(float(x), 6) for x in layer_output[i:(i + 1)].flat\n", - " ]\n", - " all_layers.append(layers)\n", + " # for (i, token) in enumerate(feature.tokens):\n", + " all_layers = []\n", + " for (j, layer_index) in enumerate(layer_indexes):\n", + " print(\"layer\", j, layer_index)\n", + " layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()\n", + " layer_output = layer_output[b]\n", + " layers = collections.OrderedDict()\n", + " layers[\"index\"] = layer_index\n", + " layer_output = layer_output\n", + " layers[\"values\"] = layer_output if not isinstance(layer_output, (int, float)) else [layer_output]\n", + " all_layers.append(layers)\n", + "\n", " out_features = collections.OrderedDict()\n", - " out_features[\"token\"] = token\n", " out_features[\"layers\"] = all_layers\n", " all_out_features.append(out_features)\n", " output_json[\"features\"] = all_out_features\n", @@ -950,11 +1000,11 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 17, "metadata": { "ExecuteTime": { - "end_time": "2018-11-02T14:09:19.706616Z", - "start_time": "2018-11-02T14:09:19.673670Z" + "end_time": "2018-11-03T02:09:57.139854Z", + "start_time": "2018-11-03T02:09:57.104636Z" } }, "outputs": [ @@ -965,84 +1015,127 @@ "1\n", "2\n", "odict_keys(['linex_index', 'features'])\n", - "14\n" + "number of tokens 1\n", + "number of layers 12\n", + "hidden_size 128\n" ] + }, + { + "data": { + "text/plain": [ + "(128, 768)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ "print(len(pytorch_all_out))\n", "print(len(pytorch_all_out[0]))\n", "print(pytorch_all_out[0].keys())\n", - "print(len(pytorch_all_out[0]['features']))" + "print(\"number of tokens\", len(pytorch_all_out))\n", + "print(\"number of layers\", len(pytorch_all_out[0]['features'][0]['layers']))\n", + "print(\"hidden_size\", len(pytorch_all_out[0]['features'][0]['layers'][0]['values']))\n", + "pytorch_all_out[0]['features'][0]['layers'][0]['values'].shape" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "metadata": { "ExecuteTime": { - "end_time": "2018-11-02T14:10:28.295669Z", - "start_time": "2018-11-02T14:10:28.263140Z" + "end_time": "2018-11-03T02:09:59.000058Z", + "start_time": "2018-11-03T02:09:58.967575Z" } }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(128, 768)\n", + "(128, 768)\n" + ] + } + ], + "source": [ + "pytorch_outputs = list(pytorch_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)\n", + "print(pytorch_outputs[0].shape)\n", + "print(pytorch_outputs[1].shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": { + "ExecuteTime": { + "end_time": "2018-11-03T02:09:59.462123Z", + "start_time": "2018-11-03T02:09:59.430932Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(128, 768)\n", + "(128, 768)\n" + ] + } + ], + "source": [ + "print(tensorflow_outputs[0].shape)\n", + "print(tensorflow_outputs[1].shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "ExecuteTime": { + "end_time": "2018-11-03T02:10:00.014784Z", + "start_time": "2018-11-03T02:09:59.983978Z" + } + }, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2018-11-03T02:10:09.582557Z", + "start_time": "2018-11-03T02:10:09.549308Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(128, 768) (128, 768)\n" + ] + }, { "data": { "text/plain": [ - "[-0.016153,\n", - " -0.697252,\n", - " -0.298296,\n", - " -0.167194,\n", - " -0.219306,\n", - " 0.061712,\n", - " -0.006953,\n", - " 0.366519,\n", - " -0.031027,\n", - " -0.33547]" + "4.1671223e-07" ] }, - "execution_count": 13, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pytorch_output = pytorch_all_out[0]['features'][0]['layers'][0]['values']\n", - "pytorch_output[:10]" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "ExecuteTime": { - "end_time": "2018-11-02T14:10:34.540457Z", - "start_time": "2018-11-02T14:10:34.510109Z" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[-0.628111,\n", - " 0.193215,\n", - " -0.75185,\n", - " -0.040464,\n", - " -0.875331,\n", - " 0.15654,\n", - " 1.385444,\n", - " 1.066997,\n", - " -0.349549,\n", - " 0.270686]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "tensorflow_output[:10]" + "i = 11\n", + "print(np.array(tensorflow_outputs[i]).shape, np.array(pytorch_outputs[i]).shape)\n", + "np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0))" ] }, { diff --git a/extract_features_pytorch.py b/extract_features_pytorch.py index 7596298cca..53a91ae48f 100644 --- a/extract_features_pytorch.py +++ b/extract_features_pytorch.py @@ -268,29 +268,31 @@ def main(): input_mask = input_mask.float().to(device) all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask) + all_encoder_layers = all_encoder_layers - for enc_layers, example_index in zip(all_encoder_layers, example_indices): + for b, example_index in enumerate(example_indices): feature = features[example_index.item()] unique_id = int(feature.unique_id) # feature = unique_id_to_feature[unique_id] output_json = collections.OrderedDict() output_json["linex_index"] = unique_id - all_features = [] + all_out_features = [] for (i, token) in enumerate(feature.tokens): all_layers = [] for (j, layer_index) in enumerate(layer_indexes): - layer_output = enc_layers[int(layer_index)].detach().cpu().numpy() + layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy() + layer_output = layer_output[b] layers = collections.OrderedDict() layers["index"] = layer_index layers["values"] = [ - round(float(x), 6) for x in layer_output[i:(i + 1)].flat + round(x.item(), 6) for x in layer_output[i] ] all_layers.append(layers) - features = collections.OrderedDict() - features["token"] = token - features["layers"] = all_layers - all_features.append(features) - output_json["features"] = all_features + out_features = collections.OrderedDict() + out_features["token"] = token + out_features["layers"] = all_layers + all_out_features.append(out_features) + output_json["features"] = all_out_features writer.write(json.dumps(output_json) + "\n") diff --git a/modeling_pytorch.py b/modeling_pytorch.py index 4a8514e3a0..76e34e09d4 100644 --- a/modeling_pytorch.py +++ b/modeling_pytorch.py @@ -27,8 +27,9 @@ import torch.nn as nn from torch.nn import CrossEntropyLoss def gelu(x): - return 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) - # OpenAI GPT gelu version was : 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) + return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0))) + # OpenAI GPT gelu version : + # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) class BertConfig(object): @@ -157,7 +158,7 @@ class BERTEmbeddings(nn.Module): words_embeddings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) token_type_embeddings = self.token_type_embeddings(token_type_ids) - + embeddings = words_embeddings + position_embeddings + token_type_embeddings embeddings = self.LayerNorm(embeddings) embeddings = self.dropout(embeddings) @@ -196,19 +197,19 @@ class BERTSelfAttention(nn.Module): # T = `to_tensor` sequence length # N = `num_attention_heads` # H = `size_per_head` - query_layer = self.query(hidden_states) - key_layer = self.key(hidden_states) - value_layer = self.value(hidden_states) + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) - query_layer = self.transpose_for_scores(query_layer) - key_layer = self.transpose_for_scores(key_layer, is_key_tensor=True) - value_layer = self.transpose_for_scores(value_layer) + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) #, is_key_tensor=True) + value_layer = self.transpose_for_scores(mixed_value_layer) # Take the dot product between "query" and "key" to get the raw # attention scores. # `attention_scores` = [B, N, F, T] - attention_scores = torch.matmul(query_layer, key_layer) - attention_scores = attention_scores / math.sqrt(self.attention_head_size) + attention_scores_no_norm = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores_no_mask = attention_scores_no_norm / math.sqrt(self.attention_head_size) # TODO clean up this (precompute) # MY PYTORCH: w = w * self.b + -1e9 * (1 - self.b) # TF implem method: mask_attn_weights @@ -220,21 +221,26 @@ class BERTSelfAttention(nn.Module): # adder = (1.0 - attention_mask) * -10000.0 # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - attention_scores += attention_mask + attention_scores = attention_scores_no_mask + attention_mask # Normalize the attention scores to probabilities. # `attention_probs` = [B, N, F, T] - attention_probs = nn.Softmax(dim=-1)(attention_scores) + attention_probs_no_drop = nn.Softmax(dim=-1)(attention_scores) # This is actually dropping out entire tokens to attend to, which might # seem a bit unusual, but is taken from the original Transformer paper. - attention_probs = self.dropout(attention_probs) + attention_probs = self.dropout(attention_probs_no_drop) context_layer = torch.matmul(attention_probs, value_layer) context_layer = context_layer.permute(0, 2, 1, 3).contiguous() new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) context_layer = context_layer.view(*new_context_layer_shape) + # aux_attention = attention_probs[0, 0, 0, :].view(1, 128, 1) + # aux_attention = aux_attention.permute(0, 2, 1, 3).contiguous().view(1, 128, 768) + # aux_attention = key_layer.permute(0, 2, 3, 1).contiguous().view(1, 128, 768) + # aux_attention = key_layer.permute(0, 2, 1, 3).contiguous().view(1, 128, 768) + return context_layer @@ -246,7 +252,7 @@ class BERTSelfOutput(nn.Module): self.dropout = nn.Dropout(config.hidden_dropout_prob) def forward(self, hidden_states, input_tensor): - hidden_states = self.dense(input_tensor) + hidden_states = self.dense(hidden_states) hidden_states = self.dropout(hidden_states) hidden_states = self.LayerNorm(hidden_states + input_tensor) return hidden_states @@ -259,8 +265,8 @@ class BERTAttention(nn.Module): self.output = BERTSelfOutput(config) def forward(self, input_tensor, attention_mask): - attention_output = self.self(input_tensor, attention_mask) - attention_output = self.output(attention_output, input_tensor) + self_output = self.self(input_tensor, attention_mask) + attention_output = self.output(self_output, input_tensor) return attention_output @@ -388,13 +394,16 @@ class BertModel(nn.Module): if token_type_ids is None: token_type_ids = torch.zeros_like(input_ids) - attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - attention_mask = (1.0 - attention_mask) * -10000.0 + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 embedding_output = self.embeddings(input_ids, token_type_ids) - all_encoder_layers = self.encoder(embedding_output, attention_mask) + all_encoder_layers = self.encoder(embedding_output, extended_attention_mask) sequence_output = all_encoder_layers[-1] pooled_output = self.pooler(sequence_output) + + # TODO DEbugging + # all_encoder_layers = [attention_mask, embeddings_sum, embedding_output] + all_encoder_layers return all_encoder_layers, pooled_output class BertForSequenceClassification(nn.Module):