From 8aa22af0c51ffc837453eb6f4302e7b3cca0a9ab Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 3 Nov 2018 03:11:13 +0100
Subject: [PATCH] fixing model

---
 Comparing TF and PT models.ipynb | 383 +++++++++++++++++++------------
 extract_features_pytorch.py      |  20 +-
 modeling_pytorch.py              |  49 ++--
 3 files changed, 278 insertions(+), 174 deletions(-)
diff --git a/Comparing TF and PT models.ipynb b/Comparing TF and PT models.ipynb
index e042bfc290..2f18d3b13c 100644
--- a/Comparing TF and PT models.ipynb	
+++ b/Comparing TF and PT models.ipynb	
@@ -12,8 +12,8 @@
    "execution_count": 1,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-02T14:09:09.239405Z",
-     "start_time": "2018-11-02T14:09:08.126668Z"
+     "end_time": "2018-11-03T02:09:37.498678Z",
+     "start_time": "2018-11-03T02:09:36.366672Z"
     }
    },
    "outputs": [],
@@ -26,8 +26,8 @@
    "execution_count": 2,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-02T14:09:09.370511Z",
-     "start_time": "2018-11-02T14:09:09.242527Z"
+     "end_time": "2018-11-03T02:09:37.621865Z",
+     "start_time": "2018-11-03T02:09:37.500988Z"
     }
    },
    "outputs": [
@@ -52,7 +52,7 @@
     "max_seq_length=128\n",
     "input_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/pytorch-pretrained-BERT/input.txt\"\n",
     "\n",
-    "layer_indexes = [-1]\n",
+    "layer_indexes = list(range(12))\n",
     "bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
     "tokenizer = tokenization.FullTokenizer(\n",
     "    vocab_file=vocab_file, do_lower_case=True)\n",
@@ -70,8 +70,8 @@
    "execution_count": 3,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-02T14:09:12.514617Z",
-     "start_time": "2018-11-02T14:09:09.372137Z"
+     "end_time": "2018-11-03T02:09:40.831618Z",
+     "start_time": "2018-11-03T02:09:37.624063Z"
     }
    },
    "outputs": [
@@ -79,15 +79,15 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12b266ae8>) includes params argument, but params are not passed to Estimator.\n",
-      "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphrjfnoqh\n",
-      "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphrjfnoqh', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
+      "WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12b0bcc80>) includes params argument, but params are not passed to Estimator.\n",
+      "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u\n",
+      "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
       "graph_options {\n",
       "  rewrite_options {\n",
       "    meta_optimizer_iterations: ONE\n",
       "  }\n",
       "}\n",
-      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12e2c1160>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
+      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12e1160f0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
       "WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n",
       "INFO:tensorflow:_TPUContext: eval_on_tpu True\n",
       "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
@@ -126,8 +126,8 @@
    "execution_count": 4,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-02T14:09:17.745970Z",
-     "start_time": "2018-11-02T14:09:12.516953Z"
+     "end_time": "2018-11-03T02:09:46.413197Z",
+     "start_time": "2018-11-03T02:09:40.834621Z"
     }
    },
    "outputs": [
@@ -135,42 +135,53 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphrjfnoqh, running initialization to predict.\n",
+      "INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u, running initialization to predict.\n",
       "INFO:tensorflow:Calling model_fn.\n",
       "INFO:tensorflow:Running infer on CPU\n",
       "INFO:tensorflow:Done calling model_fn.\n",
       "INFO:tensorflow:Graph was finalized.\n",
       "INFO:tensorflow:Running local_init_op.\n",
       "INFO:tensorflow:Done running local_init_op.\n",
+      "extracting layer 0\n",
+      "extracting layer 1\n",
+      "extracting layer 2\n",
+      "extracting layer 3\n",
+      "extracting layer 4\n",
+      "extracting layer 5\n",
+      "extracting layer 6\n",
+      "extracting layer 7\n",
+      "extracting layer 8\n",
+      "extracting layer 9\n",
+      "extracting layer 10\n",
+      "extracting layer 11\n",
       "INFO:tensorflow:prediction_loop marked as finished\n",
       "INFO:tensorflow:prediction_loop marked as finished\n"
      ]
     }
    ],
    "source": [
-    "all_out = []\n",
+    "tensorflow_all_out = []\n",
     "for result in estimator.predict(input_fn, yield_single_examples=True):\n",
     "    unique_id = int(result[\"unique_id\"])\n",
     "    feature = unique_id_to_feature[unique_id]\n",
     "    output_json = collections.OrderedDict()\n",
     "    output_json[\"linex_index\"] = unique_id\n",
-    "    all_out_features = []\n",
-    "    for (i, token) in enumerate(feature.tokens):\n",
-    "        all_layers = []\n",
-    "        for (j, layer_index) in enumerate(layer_indexes):\n",
-    "            layer_output = result[\"layer_output_%d\" % j]\n",
-    "            layers = collections.OrderedDict()\n",
-    "            layers[\"index\"] = layer_index\n",
-    "            layers[\"values\"] = [\n",
-    "                round(float(x), 6) for x in layer_output[i:(i + 1)].flat\n",
-    "            ]\n",
-    "            all_layers.append(layers)\n",
-    "        out_features = collections.OrderedDict()\n",
-    "        out_features[\"token\"] = token\n",
-    "        out_features[\"layers\"] = all_layers\n",
-    "        all_out_features.append(out_features)\n",
-    "    output_json[\"features\"] = all_out_features\n",
-    "    all_out.append(output_json)"
+    "    tensorflow_all_out_features = []\n",
+    "    # for (i, token) in enumerate(feature.tokens):\n",
+    "    all_layers = []\n",
+    "    for (j, layer_index) in enumerate(layer_indexes):\n",
+    "        print(\"extracting layer {}\".format(j))\n",
+    "        layer_output = result[\"layer_output_%d\" % j]\n",
+    "        layers = collections.OrderedDict()\n",
+    "        layers[\"index\"] = layer_index\n",
+    "        layers[\"values\"] = layer_output\n",
+    "        all_layers.append(layers)\n",
+    "    tensorflow_out_features = collections.OrderedDict()\n",
+    "    tensorflow_out_features[\"layers\"] = all_layers\n",
+    "    tensorflow_all_out_features.append(tensorflow_out_features)\n",
+    "\n",
+    "    output_json[\"features\"] = tensorflow_all_out_features\n",
+    "    tensorflow_all_out.append(output_json)"
    ]
   },
   {
@@ -178,8 +189,8 @@
    "execution_count": 5,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-02T14:09:17.780532Z",
-     "start_time": "2018-11-02T14:09:17.748778Z"
+     "end_time": "2018-11-03T02:09:46.460128Z",
+     "start_time": "2018-11-03T02:09:46.416138Z"
     }
    },
    "outputs": [
@@ -190,15 +201,28 @@
       "1\n",
       "2\n",
       "odict_keys(['linex_index', 'features'])\n",
-      "14\n"
+      "number of tokens 1\n",
+      "number of layers 12\n"
      ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "(128, 768)"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "print(len(all_out))\n",
-    "print(len(all_out[0]))\n",
-    "print(all_out[0].keys())\n",
-    "print(len(all_out[0]['features']))"
+    "print(len(tensorflow_all_out))\n",
+    "print(len(tensorflow_all_out[0]))\n",
+    "print(tensorflow_all_out[0].keys())\n",
+    "print(\"number of tokens\", len(tensorflow_all_out[0]['features']))\n",
+    "print(\"number of layers\", len(tensorflow_all_out[0]['features'][0]['layers']))\n",
+    "tensorflow_all_out[0]['features'][0]['layers'][0]['values'].shape"
    ]
   },
   {
@@ -206,34 +230,13 @@
    "execution_count": 6,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-02T14:09:17.818968Z",
-     "start_time": "2018-11-02T14:09:17.782121Z"
+     "end_time": "2018-11-03T02:09:46.498637Z",
+     "start_time": "2018-11-03T02:09:46.463115Z"
     }
    },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[-0.628111,\n",
-       " 0.193215,\n",
-       " -0.75185,\n",
-       " -0.040464,\n",
-       " -0.875331,\n",
-       " 0.15654,\n",
-       " 1.385444,\n",
-       " 1.066997,\n",
-       " -0.349549,\n",
-       " 0.270686]"
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "tensorflow_output = all_out[0]['features'][0]['layers'][0]['values']\n",
-    "tensorflow_output[:10]"
+    "tensorflow_outputs = list(tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)"
    ]
   },
   {
@@ -248,12 +251,13 @@
    "execution_count": 7,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-02T14:09:17.954196Z",
-     "start_time": "2018-11-02T14:09:17.821115Z"
+     "end_time": "2018-11-03T02:09:46.660303Z",
+     "start_time": "2018-11-03T02:09:46.501325Z"
     }
    },
    "outputs": [],
    "source": [
+    "import extract_features_pytorch\n",
     "from extract_features_pytorch import *"
    ]
   },
@@ -262,8 +266,8 @@
    "execution_count": 8,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-02T14:09:19.196475Z",
-     "start_time": "2018-11-02T14:09:17.956199Z"
+     "end_time": "2018-11-03T02:09:48.292135Z",
+     "start_time": "2018-11-03T02:09:46.661921Z"
     }
    },
    "outputs": [
@@ -574,7 +578,7 @@
     "init_checkpoint_pt=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/pytorch_model.bin\"\n",
     "\n",
     "device = torch.device(\"cpu\")\n",
-    "model = BertModel(bert_config)\n",
+    "model = extract_features_pytorch.BertModel(bert_config)\n",
     "model.load_state_dict(torch.load(init_checkpoint_pt, map_location='cpu'))\n",
     "model.to(device)"
    ]
@@ -584,8 +588,8 @@
    "execution_count": 9,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-02T14:09:19.236256Z",
-     "start_time": "2018-11-02T14:09:19.198407Z"
+     "end_time": "2018-11-03T02:09:48.332982Z",
+     "start_time": "2018-11-03T02:09:48.294056Z"
     },
     "code_folding": []
    },
@@ -896,9 +900,10 @@
    "source": [
     "all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n",
     "all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n",
+    "all_input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)\n",
     "all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n",
     "\n",
-    "eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)\n",
+    "eval_data = TensorDataset(all_input_ids, all_input_mask, all_input_type_ids, all_example_index)\n",
     "eval_sampler = SequentialSampler(eval_data)\n",
     "eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\n",
     "\n",
@@ -907,41 +912,86 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 16,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-02T14:09:19.671994Z",
-     "start_time": "2018-11-02T14:09:19.239454Z"
+     "end_time": "2018-11-03T02:09:54.371188Z",
+     "start_time": "2018-11-03T02:09:53.976875Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensor([[  101,  2040,  2001,  3958, 27227,  1029,   102,  3958, 27227,  2001,\n",
+      "          1037, 13997, 11510,   102,     0,     0,     0,     0,     0,     0,\n",
+      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
+      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
+      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
+      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
+      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
+      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
+      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
+      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
+      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
+      "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
+      "             0,     0,     0,     0,     0,     0,     0,     0]])\n",
+      "tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+      "         0, 0, 0, 0, 0, 0, 0, 0]])\n",
+      "tensor([0])\n",
+      "layer 0 0\n",
+      "layer 1 1\n",
+      "layer 2 2\n",
+      "layer 3 3\n",
+      "layer 4 4\n",
+      "layer 5 5\n",
+      "layer 6 6\n",
+      "layer 7 7\n",
+      "layer 8 8\n",
+      "layer 9 9\n",
+      "layer 10 10\n",
+      "layer 11 11\n"
+     ]
+    }
+   ],
    "source": [
+    "layer_indexes = list(range(12))\n",
+    "\n",
     "pytorch_all_out = []\n",
-    "for input_ids, input_mask, example_indices in eval_dataloader:\n",
+    "for input_ids, input_mask, input_type_ids, example_indices in eval_dataloader:\n",
+    "    print(input_ids)\n",
+    "    print(input_mask)\n",
+    "    print(example_indices)\n",
     "    input_ids = input_ids.to(device)\n",
     "    input_mask = input_mask.float().to(device)\n",
     "\n",
-    "    all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)\n",
+    "    all_encoder_layers, _ = model(input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)\n",
     "\n",
-    "    for enc_layers, example_index in zip(all_encoder_layers, example_indices):\n",
+    "    for b, example_index in enumerate(example_indices):\n",
     "        feature = features[example_index.item()]\n",
     "        unique_id = int(feature.unique_id)\n",
     "        # feature = unique_id_to_feature[unique_id]\n",
     "        output_json = collections.OrderedDict()\n",
     "        output_json[\"linex_index\"] = unique_id\n",
     "        all_out_features = []\n",
-    "        for (i, token) in enumerate(feature.tokens):\n",
-    "            all_layers = []\n",
-    "            for (j, layer_index) in enumerate(layer_indexes):\n",
-    "                layer_output = enc_layers[int(layer_index)].detach().cpu().numpy()\n",
-    "                layers = collections.OrderedDict()\n",
-    "                layers[\"index\"] = layer_index\n",
-    "                layers[\"values\"] = [\n",
-    "                    round(float(x), 6) for x in layer_output[i:(i + 1)].flat\n",
-    "                ]\n",
-    "                all_layers.append(layers)\n",
+    "        # for (i, token) in enumerate(feature.tokens):\n",
+    "        all_layers = []\n",
+    "        for (j, layer_index) in enumerate(layer_indexes):\n",
+    "            print(\"layer\", j, layer_index)\n",
+    "            layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()\n",
+    "            layer_output = layer_output[b]\n",
+    "            layers = collections.OrderedDict()\n",
+    "            layers[\"index\"] = layer_index\n",
+    "            layer_output = layer_output\n",
+    "            layers[\"values\"] = layer_output if not isinstance(layer_output, (int, float)) else [layer_output]\n",
+    "            all_layers.append(layers)\n",
+    "\n",
     "            out_features = collections.OrderedDict()\n",
-    "            out_features[\"token\"] = token\n",
     "            out_features[\"layers\"] = all_layers\n",
     "            all_out_features.append(out_features)\n",
     "        output_json[\"features\"] = all_out_features\n",
@@ -950,11 +1000,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 17,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-02T14:09:19.706616Z",
-     "start_time": "2018-11-02T14:09:19.673670Z"
+     "end_time": "2018-11-03T02:09:57.139854Z",
+     "start_time": "2018-11-03T02:09:57.104636Z"
     }
    },
    "outputs": [
@@ -965,84 +1015,127 @@
       "1\n",
       "2\n",
       "odict_keys(['linex_index', 'features'])\n",
-      "14\n"
+      "number of tokens 1\n",
+      "number of layers 12\n",
+      "hidden_size 128\n"
      ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "(128, 768)"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
     "print(len(pytorch_all_out))\n",
     "print(len(pytorch_all_out[0]))\n",
     "print(pytorch_all_out[0].keys())\n",
-    "print(len(pytorch_all_out[0]['features']))"
+    "print(\"number of tokens\", len(pytorch_all_out))\n",
+    "print(\"number of layers\", len(pytorch_all_out[0]['features'][0]['layers']))\n",
+    "print(\"hidden_size\", len(pytorch_all_out[0]['features'][0]['layers'][0]['values']))\n",
+    "pytorch_all_out[0]['features'][0]['layers'][0]['values'].shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 18,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-02T14:10:28.295669Z",
-     "start_time": "2018-11-02T14:10:28.263140Z"
+     "end_time": "2018-11-03T02:09:59.000058Z",
+     "start_time": "2018-11-03T02:09:58.967575Z"
     }
    },
    "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(128, 768)\n",
+      "(128, 768)\n"
+     ]
+    }
+   ],
+   "source": [
+    "pytorch_outputs = list(pytorch_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)\n",
+    "print(pytorch_outputs[0].shape)\n",
+    "print(pytorch_outputs[1].shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-11-03T02:09:59.462123Z",
+     "start_time": "2018-11-03T02:09:59.430932Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(128, 768)\n",
+      "(128, 768)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(tensorflow_outputs[0].shape)\n",
+    "print(tensorflow_outputs[1].shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-11-03T02:10:00.014784Z",
+     "start_time": "2018-11-03T02:09:59.983978Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-11-03T02:10:09.582557Z",
+     "start_time": "2018-11-03T02:10:09.549308Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(128, 768) (128, 768)\n"
+     ]
+    },
     {
      "data": {
       "text/plain": [
-       "[-0.016153,\n",
-       " -0.697252,\n",
-       " -0.298296,\n",
-       " -0.167194,\n",
-       " -0.219306,\n",
-       " 0.061712,\n",
-       " -0.006953,\n",
-       " 0.366519,\n",
-       " -0.031027,\n",
-       " -0.33547]"
+       "4.1671223e-07"
       ]
      },
-     "execution_count": 13,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "pytorch_output = pytorch_all_out[0]['features'][0]['layers'][0]['values']\n",
-    "pytorch_output[:10]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-02T14:10:34.540457Z",
-     "start_time": "2018-11-02T14:10:34.510109Z"
-    }
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[-0.628111,\n",
-       " 0.193215,\n",
-       " -0.75185,\n",
-       " -0.040464,\n",
-       " -0.875331,\n",
-       " 0.15654,\n",
-       " 1.385444,\n",
-       " 1.066997,\n",
-       " -0.349549,\n",
-       " 0.270686]"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "tensorflow_output[:10]"
+    "i = 11\n",
+    "print(np.array(tensorflow_outputs[i]).shape, np.array(pytorch_outputs[i]).shape)\n",
+    "np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0))"
    ]
   },
   {
diff --git a/extract_features_pytorch.py b/extract_features_pytorch.py
index 7596298cca..53a91ae48f 100644
--- a/extract_features_pytorch.py
+++ b/extract_features_pytorch.py
@@ -268,29 +268,31 @@ def main():
             input_mask = input_mask.float().to(device)
 
             all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
+            all_encoder_layers = all_encoder_layers
 
-            for enc_layers, example_index in zip(all_encoder_layers, example_indices):
+            for b, example_index in enumerate(example_indices):
                 feature = features[example_index.item()]
                 unique_id = int(feature.unique_id)
                 # feature = unique_id_to_feature[unique_id]
                 output_json = collections.OrderedDict()
                 output_json["linex_index"] = unique_id
-                all_features = []
+                all_out_features = []
                 for (i, token) in enumerate(feature.tokens):
                     all_layers = []
                     for (j, layer_index) in enumerate(layer_indexes):
-                        layer_output = enc_layers[int(layer_index)].detach().cpu().numpy()
+                        layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
+                        layer_output = layer_output[b]
                         layers = collections.OrderedDict()
                         layers["index"] = layer_index
                         layers["values"] = [
-                            round(float(x), 6) for x in layer_output[i:(i + 1)].flat
+                            round(x.item(), 6) for x in layer_output[i]
                         ]
                         all_layers.append(layers)
-                    features = collections.OrderedDict()
-                    features["token"] = token
-                    features["layers"] = all_layers
-                    all_features.append(features)
-                output_json["features"] = all_features
+                    out_features = collections.OrderedDict()
+                    out_features["token"] = token
+                    out_features["layers"] = all_layers
+                    all_out_features.append(out_features)
+                output_json["features"] = all_out_features
                 writer.write(json.dumps(output_json) + "\n")
 
 
diff --git a/modeling_pytorch.py b/modeling_pytorch.py
index 4a8514e3a0..76e34e09d4 100644
--- a/modeling_pytorch.py
+++ b/modeling_pytorch.py
@@ -27,8 +27,9 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 
 def gelu(x):
-    return 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
-    # OpenAI GPT gelu version was : 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
+    # OpenAI GPT gelu version (tanh approximation):
+    # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
 
 
 class BertConfig(object):
@@ -157,7 +158,7 @@ class BERTEmbeddings(nn.Module):
         words_embeddings = self.word_embeddings(input_ids)
         position_embeddings = self.position_embeddings(position_ids)
         token_type_embeddings = self.token_type_embeddings(token_type_ids)
-    
+
         embeddings = words_embeddings + position_embeddings + token_type_embeddings
         embeddings = self.LayerNorm(embeddings)
         embeddings = self.dropout(embeddings)
@@ -196,19 +197,19 @@ class BERTSelfAttention(nn.Module):
         #   T = `to_tensor` sequence length
         #   N = `num_attention_heads`
         #   H = `size_per_head`
-        query_layer = self.query(hidden_states)
-        key_layer = self.key(hidden_states)
-        value_layer = self.value(hidden_states)
+        mixed_query_layer = self.query(hidden_states)
+        mixed_key_layer = self.key(hidden_states)
+        mixed_value_layer = self.value(hidden_states)
 
-        query_layer = self.transpose_for_scores(query_layer)
-        key_layer = self.transpose_for_scores(key_layer, is_key_tensor=True)
-        value_layer = self.transpose_for_scores(value_layer)
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+        key_layer = self.transpose_for_scores(mixed_key_layer) #, is_key_tensor=True)
+        value_layer = self.transpose_for_scores(mixed_value_layer)
 
         # Take the dot product between "query" and "key" to get the raw
         # attention scores.
         # `attention_scores` = [B, N, F, T]
-        attention_scores = torch.matmul(query_layer, key_layer)
-        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        attention_scores_no_norm = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_scores_no_mask = attention_scores_no_norm / math.sqrt(self.attention_head_size)
 
         # TODO clean up this (precompute)
         # MY PYTORCH: w = w * self.b + -1e9 * (1 - self.b)  # TF implem method: mask_attn_weights
@@ -220,21 +221,26 @@ class BERTSelfAttention(nn.Module):
         # adder = (1.0 - attention_mask) * -10000.0
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
-        attention_scores += attention_mask
+        attention_scores = attention_scores_no_mask + attention_mask
 
         # Normalize the attention scores to probabilities.
         # `attention_probs` = [B, N, F, T]
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+        attention_probs_no_drop = nn.Softmax(dim=-1)(attention_scores)
 
         # This is actually dropping out entire tokens to attend to, which might
         # seem a bit unusual, but is taken from the original Transformer paper.
-        attention_probs = self.dropout(attention_probs)
+        attention_probs = self.dropout(attention_probs_no_drop)
 
         context_layer = torch.matmul(attention_probs, value_layer)
         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
         context_layer = context_layer.view(*new_context_layer_shape)
 
+        # aux_attention = attention_probs[0, 0, 0, :].view(1, 128, 1)
+        # aux_attention = aux_attention.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
+        # aux_attention = key_layer.permute(0, 2, 3, 1).contiguous().view(1, 128, 768)
+        # aux_attention = key_layer.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
+
         return context_layer
 
 
@@ -246,7 +252,7 @@ class BERTSelfOutput(nn.Module):
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
     def forward(self, hidden_states, input_tensor):
-        hidden_states = self.dense(input_tensor)
+        hidden_states = self.dense(hidden_states)
         hidden_states = self.dropout(hidden_states)
         hidden_states = self.LayerNorm(hidden_states + input_tensor)
         return hidden_states
@@ -259,8 +265,8 @@ class BERTAttention(nn.Module):
         self.output = BERTSelfOutput(config)
 
     def forward(self, input_tensor, attention_mask):
-        attention_output = self.self(input_tensor, attention_mask)
-        attention_output = self.output(attention_output, input_tensor)
+        self_output = self.self(input_tensor, attention_mask)
+        attention_output = self.output(self_output, input_tensor)
         return attention_output
 
 
@@ -388,13 +394,16 @@ class BertModel(nn.Module):
         if token_type_ids is None:
             token_type_ids = torch.zeros_like(input_ids)
 
-        attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-        attention_mask = (1.0 - attention_mask) * -10000.0
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
 
         embedding_output = self.embeddings(input_ids, token_type_ids)
-        all_encoder_layers = self.encoder(embedding_output, attention_mask)
+        all_encoder_layers = self.encoder(embedding_output, extended_attention_mask)
         sequence_output = all_encoder_layers[-1]
         pooled_output = self.pooler(sequence_output)
+
+        # TODO Debugging
+        # all_encoder_layers = [attention_mask, embeddings_sum, embedding_output] + all_encoder_layers
         return all_encoder_layers, pooled_output
 
 class BertForSequenceClassification(nn.Module):