From fd647e8c87569121653aa8b4b6cef72a9344e8fd Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 16 Nov 2018 11:04:31 +0100
Subject: [PATCH] comparison masked LM ok

---
 .../Comparing TF and PT models_MLM_NSP.ipynb  | 5149 ++++++++++++++---
 1 file changed, 4348 insertions(+), 801 deletions(-)

diff --git a/notebooks/Comparing TF and PT models_MLM_NSP.ipynb b/notebooks/Comparing TF and PT models_MLM_NSP.ipynb
index 7b226e8371..d5e6bac68f 100644
--- a/notebooks/Comparing TF and PT models_MLM_NSP.ipynb	
+++ b/notebooks/Comparing TF and PT models_MLM_NSP.ipynb	
@@ -22,8 +22,8 @@
    "execution_count": 1,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-05T13:58:50.559657Z",
-     "start_time": "2018-11-05T13:58:50.546096Z"
+     "end_time": "2018-11-16T10:02:26.999106Z",
+     "start_time": "2018-11-16T10:02:26.985709Z"
     }
    },
    "outputs": [],
@@ -44,8 +44,8 @@
    "execution_count": 2,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-05T13:58:50.574455Z",
-     "start_time": "2018-11-05T13:58:50.561988Z"
+     "end_time": "2018-11-16T10:02:27.664528Z",
+     "start_time": "2018-11-16T10:02:27.651019Z"
     }
    },
    "outputs": [],
@@ -58,7 +58,10 @@
     "init_checkpoint = model_dir + \"bert_model.ckpt\"\n",
     "\n",
     "input_file = \"./samples/input.txt\"\n",
-    "max_seq_length = 128"
+    "max_seq_length = 128\n",
+    "max_predictions_per_seq = 20\n",
+    "\n",
+    "masked_lm_positions = [6]"
    ]
   },
   {
@@ -66,21 +69,33 @@
    "execution_count": 3,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-05T13:58:52.202531Z",
-     "start_time": "2018-11-05T13:58:50.576198Z"
+     "end_time": "2018-11-16T10:02:30.202182Z",
+     "start_time": "2018-11-16T10:02:28.112570Z"
     }
    },
    "outputs": [],
    "source": [
     "import importlib.util\n",
     "import sys\n",
+    "import tensorflow as tf\n",
+    "import pytorch_pretrained_bert as ppb\n",
     "\n",
-    "spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/extract_features.py')\n",
-    "module = importlib.util.module_from_spec(spec)\n",
-    "spec.loader.exec_module(module)\n",
-    "sys.modules['extract_features_tensorflow'] = module\n",
+    "def del_all_flags(FLAGS):\n",
+    "    \"\"\"Delete from `FLAGS` every flag named by `FLAGS._flags()`.\"\"\"\n",
+    "    # Copy the names first so that deleting cannot disturb the iteration.\n",
+    "    for flag_name in list(FLAGS._flags()):\n",
+    "        delattr(FLAGS, flag_name)\n",
     "\n",
-    "from extract_features_tensorflow import *"
+    "del_all_flags(tf.flags.FLAGS)\n",
+    "import tensorflow_code.extract_features as ef\n",
+    "del_all_flags(tf.flags.FLAGS)\n",
+    "import tensorflow_code.modeling as tfm\n",
+    "del_all_flags(tf.flags.FLAGS)\n",
+    "import tensorflow_code.tokenization as tft\n",
+    "del_all_flags(tf.flags.FLAGS)\n",
+    "import tensorflow_code.run_pretraining as rp\n",
+    "del_all_flags(tf.flags.FLAGS)\n",
+    "import tensorflow_code.create_pretraining_data as cpp"
    ]
   },
   {
@@ -88,36 +103,66 @@
    "execution_count": 4,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-05T13:58:52.325822Z",
-     "start_time": "2018-11-05T13:58:52.205361Z"
-    }
+     "end_time": "2018-11-16T10:02:30.238027Z",
+     "start_time": "2018-11-16T10:02:30.204943Z"
+    },
+    "code_folding": [
+     15
+    ]
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "INFO:tensorflow:*** Example ***\n",
-      "INFO:tensorflow:unique_id: 0\n",
-      "INFO:tensorflow:tokens: [CLS] who was jim henson ? [SEP] jim henson was a puppet ##eer [SEP]\n",
-      "INFO:tensorflow:input_ids: 101 2040 2001 3958 27227 1029 102 3958 27227 2001 1037 13997 11510 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n",
-      "INFO:tensorflow:input_type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "layer_indexes = list(range(12))\n",
-    "bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
-    "tokenizer = tokenization.BertTokenizer(\n",
-    "    vocab_file=vocab_file, do_lower_case=True)\n",
-    "examples = read_examples(input_file)\n",
+    "import re\n",
+    "class InputExample(object):\n",
+    "    \"\"\"A single instance example.\"\"\"\n",
     "\n",
-    "features = convert_examples_to_features(\n",
-    "    examples=examples, seq_length=max_seq_length, tokenizer=tokenizer)\n",
-    "unique_id_to_feature = {}\n",
-    "for feature in features:\n",
-    "    unique_id_to_feature[feature.unique_id] = feature"
+    "    def __init__(self, tokens, segment_ids, masked_lm_positions,\n",
+    "                 masked_lm_labels, is_random_next):\n",
+    "        self.tokens = tokens\n",
+    "        self.segment_ids = segment_ids\n",
+    "        self.masked_lm_positions = masked_lm_positions\n",
+    "        self.masked_lm_labels = masked_lm_labels\n",
+    "        self.is_random_next = is_random_next\n",
+    "    def __repr__(self):\n",
+    "        return '\\n'.join(k + \":\" + str(v) for k, v in self.__dict__.items())\n",
+    "\n",
+    "\n",
+    "def read_examples(input_file, tokenizer, masked_lm_positions):\n",
+    "    \"\"\"Read a list of `InputExample`s from an input file.\n",
+    "\n",
+    "    Each non-blank line is `text_a` or `text_a ||| text_b`; blank lines\n",
+    "    are skipped. Tokens at `masked_lm_positions` (indices into\n",
+    "    tokens_a + tokens_b) are replaced by '[MASK]' and the originals are\n",
+    "    kept as `masked_lm_labels`.\n",
+    "    Raises IndexError if a position is beyond a line's tokens.\n",
+    "    \"\"\"\n",
+    "    examples = []\n",
+    "    with tf.gfile.GFile(input_file, \"r\") as reader:\n",
+    "        for line in reader:\n",
+    "            line = line.strip()\n",
+    "            if not line:\n",
+    "                continue\n",
+    "            m = re.match(r\"^(.*) \\|\\|\\| (.*)$\", line)\n",
+    "            if m is None:\n",
+    "                text_a, text_b = line, None\n",
+    "            else:\n",
+    "                text_a, text_b = m.group(1), m.group(2)\n",
+    "            tokens_a = tokenizer.tokenize(text_a)\n",
+    "            # No second segment -> empty list, so `+` and len() below work.\n",
+    "            tokens_b = tokenizer.tokenize(text_b) if text_b else []\n",
+    "            tokens = tokens_a + tokens_b\n",
+    "            masked_lm_labels = []\n",
+    "            for m_pos in masked_lm_positions:\n",
+    "                masked_lm_labels.append(tokens[m_pos])\n",
+    "                tokens[m_pos] = '[MASK]'\n",
+    "            examples.append(\n",
+    "                InputExample(\n",
+    "                    tokens = tokens,\n",
+    "                    segment_ids = [0] * len(tokens_a) + [1] * len(tokens_b),\n",
+    "                    masked_lm_positions = list(masked_lm_positions),\n",
+    "                    masked_lm_labels = masked_lm_labels,\n",
+    "                    is_random_next = False))\n",
+    "    return examples"
    ]
   },
   {
@@ -125,8 +170,8 @@
    "execution_count": 5,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-05T13:58:55.939938Z",
-     "start_time": "2018-11-05T13:58:52.330202Z"
+     "end_time": "2018-11-16T10:02:30.304018Z",
+     "start_time": "2018-11-16T10:02:30.240189Z"
     }
    },
    "outputs": [
@@ -134,19 +179,534 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12839dbf8>) includes params argument, but params are not passed to Estimator.\n",
-      "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpdbx_h23u\n",
-      "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpdbx_h23u', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
+      "tokens:['who', 'was', 'jim', 'henson', '?', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer']\n",
+      "segment_ids:[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]\n",
+      "masked_lm_positions:[6]\n",
+      "masked_lm_labels:['henson']\n",
+      "is_random_next:False\n"
+     ]
+    }
+   ],
+   "source": [
+    "bert_config = tfm.BertConfig.from_json_file(bert_config_file)\n",
+    "tokenizer = ppb.BertTokenizer(\n",
+    "    vocab_file=vocab_file, do_lower_case=True)\n",
+    "examples = read_examples(input_file, tokenizer, masked_lm_positions=masked_lm_positions)\n",
+    "\n",
+    "print(examples[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-11-16T10:02:33.324167Z",
+     "start_time": "2018-11-16T10:02:33.291909Z"
+    },
+    "code_folding": [
+     16
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "class InputFeatures(object):\n",
+    "    \"\"\"The feature values of one example, held as plain attributes.\"\"\"\n",
+    "\n",
+    "    def __init__(self, input_ids, input_mask, segment_ids, masked_lm_positions,\n",
+    "                 masked_lm_ids, masked_lm_weights, next_sentence_label):\n",
+    "        # Attributes are set in the order that __repr__ prints them.\n",
+    "        self.input_ids, self.input_mask = input_ids, input_mask\n",
+    "        self.segment_ids = segment_ids\n",
+    "        self.masked_lm_positions = masked_lm_positions\n",
+    "        self.masked_lm_ids, self.masked_lm_weights = masked_lm_ids, masked_lm_weights\n",
+    "        # Note the plural: the attribute is `next_sentence_labels`.\n",
+    "        self.next_sentence_labels = next_sentence_label\n",
+    "\n",
+    "    def __repr__(self):\n",
+    "        return '\\n'.join('{}:{}'.format(k, v) for k, v in vars(self).items())\n",
+    "\n",
+    "def pretraining_convert_examples_to_features(instances, tokenizer, max_seq_length,\n",
+    "                                 max_predictions_per_seq):\n",
+    "    \"\"\"Convert examples into a list of fixed-length `InputFeatures`.\n",
+    "\n",
+    "    Tokens are mapped to ids with `tokenizer` and zero-padded to\n",
+    "    `max_seq_length`; the masked-LM lists are zero-padded up to\n",
+    "    `max_predictions_per_seq`. The first five results are logged.\n",
+    "    \"\"\"\n",
+    "    features = []\n",
+    "    for inst_index, instance in enumerate(instances):\n",
+    "        input_ids = tokenizer.convert_tokens_to_ids(instance.tokens)\n",
+    "        input_mask = [1] * len(input_ids)\n",
+    "        segment_ids = list(instance.segment_ids)\n",
+    "        assert len(input_ids) <= max_seq_length\n",
+    "        # Right-pad all three sequences by the same number of slots.\n",
+    "        seq_pad = max_seq_length - len(input_ids)\n",
+    "        input_ids += [0] * seq_pad\n",
+    "        input_mask += [0] * seq_pad\n",
+    "        segment_ids += [0] * seq_pad\n",
+    "        assert len(input_ids) == max_seq_length\n",
+    "        assert len(input_mask) == max_seq_length\n",
+    "        assert len(segment_ids) == max_seq_length\n",
+    "\n",
+    "        masked_lm_positions = list(instance.masked_lm_positions)\n",
+    "        masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels)\n",
+    "        masked_lm_weights = [1.0] * len(masked_lm_ids)\n",
+    "\n",
+    "        # Padded slots get weight 0.0; a non-positive count adds nothing.\n",
+    "        pred_pad = max_predictions_per_seq - len(masked_lm_positions)\n",
+    "        masked_lm_positions += [0] * pred_pad\n",
+    "        masked_lm_ids += [0] * pred_pad\n",
+    "        masked_lm_weights += [0.0] * pred_pad\n",
+    "        feature = InputFeatures(input_ids, input_mask, segment_ids,\n",
+    "                                masked_lm_positions, masked_lm_ids,\n",
+    "                                masked_lm_weights, 1 if instance.is_random_next else 0)\n",
+    "        features.append(feature)\n",
+    "        if inst_index < 5:\n",
+    "            tf.logging.info(\"*** Example ***\")\n",
+    "            tf.logging.info(\"tokens: %s\" % \" \".join(map(str, instance.tokens)))\n",
+    "            tf.logging.info(\"features: %s\" % str(feature))\n",
+    "    return features"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-11-16T10:02:34.185367Z",
+     "start_time": "2018-11-16T10:02:34.155046Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:*** Example ***\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:34 - INFO - tensorflow -   *** Example ***\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:tokens: who was jim henson ? jim [MASK] was a puppet ##eer\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:34 - INFO - tensorflow -   tokens: who was jim henson ? jim [MASK] was a puppet ##eer\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:features: input_ids:[2040, 2001, 3958, 27227, 1029, 3958, 103, 2001, 1037, 13997, 11510, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
+      "input_mask:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
+      "segment_ids:[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
+      "masked_lm_positions:[6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
+      "masked_lm_ids:[27227, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
+      "masked_lm_weights:[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]\n",
+      "next_sentence_labels:0\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:34 - INFO - tensorflow -   features: input_ids:[2040, 2001, 3958, 27227, 1029, 3958, 103, 2001, 1037, 13997, 11510, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
+      "input_mask:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
+      "segment_ids:[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
+      "masked_lm_positions:[6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
+      "masked_lm_ids:[27227, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]\n",
+      "masked_lm_weights:[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]\n",
+      "next_sentence_labels:0\n"
+     ]
+    }
+   ],
+   "source": [
+    "features = pretraining_convert_examples_to_features(\n",
+    "    instances=examples, max_seq_length=max_seq_length, \n",
+    "    max_predictions_per_seq=max_predictions_per_seq, tokenizer=tokenizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-11-16T10:02:34.912005Z",
+     "start_time": "2018-11-16T10:02:34.882111Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "def input_fn_builder(features, seq_length, max_predictions_per_seq, tokenizer):\n",
+    "    \"\"\"Creates an `input_fn` closure to be passed to TPUEstimator.\n",
+    "\n",
+    "    Args:\n",
+    "        features: sized sequence of objects exposing the attributes\n",
+    "            input_ids, input_mask, segment_ids, masked_lm_positions,\n",
+    "            masked_lm_ids, masked_lm_weights and next_sentence_labels.\n",
+    "        seq_length: second dimension of the three per-token tensors.\n",
+    "        max_predictions_per_seq: second dimension of the three masked-LM\n",
+    "            tensors.\n",
+    "        tokenizer: accepted but not used by this function.\n",
+    "\n",
+    "    Returns:\n",
+    "        A function `input_fn(params)` that reads `params[\"batch_size\"]` and\n",
+    "        returns a batched `tf.data.Dataset` of feature dictionaries.\n",
+    "    \"\"\"\n",
+    "    # Attribute names on each feature; also the keys of the dataset dict.\n",
+    "    column_names = (\n",
+    "        \"input_ids\",\n",
+    "        \"input_mask\",\n",
+    "        \"segment_ids\",\n",
+    "        \"masked_lm_positions\",\n",
+    "        \"masked_lm_ids\",\n",
+    "        \"masked_lm_weights\",\n",
+    "        \"next_sentence_labels\",\n",
+    "    )\n",
+    "\n",
+    "    # Gather every column in a single pass, before the closure is called.\n",
+    "    values = {column_name: [] for column_name in column_names}\n",
+    "    for feature in features:\n",
+    "        for column_name in column_names:\n",
+    "            values[column_name].append(getattr(feature, column_name))\n",
+    "\n",
+    "    def input_fn(params):\n",
+    "        \"\"\"The actual input function.\"\"\"\n",
+    "        batch_size = params[\"batch_size\"]\n",
+    "\n",
+    "        num_examples = len(features)\n",
+    "\n",
+    "        def constant(key, width, dtype):\n",
+    "            \"\"\"Wrap one gathered column as a [num_examples, width] constant.\"\"\"\n",
+    "            return tf.constant(\n",
+    "                values[key],\n",
+    "                shape=[num_examples, width],\n",
+    "                dtype=dtype)\n",
+    "\n",
+    "        # This is for demo purposes and does NOT scale to large data sets. We do\n",
+    "        # not use Dataset.from_generator() because that uses tf.py_func which is\n",
+    "        # not TPU compatible. The right way to load data is with TFRecordReader.\n",
+    "        d = tf.data.Dataset.from_tensor_slices({\n",
+    "            \"input_ids\":\n",
+    "                constant(\"input_ids\", seq_length, tf.int32),\n",
+    "            \"input_mask\":\n",
+    "                constant(\"input_mask\", seq_length, tf.int32),\n",
+    "            \"segment_ids\":\n",
+    "                constant(\"segment_ids\", seq_length, tf.int32),\n",
+    "            \"masked_lm_positions\":\n",
+    "                constant(\"masked_lm_positions\", max_predictions_per_seq, tf.int32),\n",
+    "            \"masked_lm_ids\":\n",
+    "                constant(\"masked_lm_ids\", max_predictions_per_seq, tf.int32),\n",
+    "            \"masked_lm_weights\":\n",
+    "                constant(\"masked_lm_weights\", max_predictions_per_seq, tf.float32),\n",
+    "            \"next_sentence_labels\":\n",
+    "                constant(\"next_sentence_labels\", 1, tf.int32),\n",
+    "        })\n",
+    "\n",
+    "        d = d.batch(batch_size=batch_size, drop_remainder=False)\n",
+    "        return d\n",
+    "\n",
+    "    return input_fn\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-11-16T10:02:35.671603Z",
+     "start_time": "2018-11-16T10:02:35.626167Z"
+    },
+    "code_folding": [
+     64,
+     77
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "def model_fn_builder(bert_config, init_checkpoint, learning_rate,\n",
+    "                     num_train_steps, num_warmup_steps, use_tpu,\n",
+    "                     use_one_hot_embeddings):\n",
+    "    \"\"\"Returns `model_fn` closure for TPUEstimator.\n",
+    "\n",
+    "    The closure builds `tfm.BertModel` plus the masked-LM and next-sentence\n",
+    "    heads from `rp`, optionally initialises variables from `init_checkpoint`,\n",
+    "    and returns a `TPUEstimatorSpec` for TRAIN, EVAL or PREDICT mode.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument\n",
+    "        \"\"\"The `model_fn` for TPUEstimator.\"\"\"\n",
+    "\n",
+    "        tf.logging.info(\"*** Features ***\")\n",
+    "        for name in sorted(features.keys()):\n",
+    "            tf.logging.info(\"  name = %s, shape = %s\" % (name, features[name].shape))\n",
+    "\n",
+    "        input_ids = features[\"input_ids\"]\n",
+    "        input_mask = features[\"input_mask\"]\n",
+    "        segment_ids = features[\"segment_ids\"]\n",
+    "        masked_lm_positions = features[\"masked_lm_positions\"]\n",
+    "        masked_lm_ids = features[\"masked_lm_ids\"]\n",
+    "        masked_lm_weights = features[\"masked_lm_weights\"]\n",
+    "        next_sentence_labels = features[\"next_sentence_labels\"]\n",
+    "\n",
+    "        is_training = (mode == tf.estimator.ModeKeys.TRAIN)\n",
+    "\n",
+    "        model = tfm.BertModel(\n",
+    "            config=bert_config,\n",
+    "            is_training=is_training,\n",
+    "            input_ids=input_ids,\n",
+    "            input_mask=input_mask,\n",
+    "            token_type_ids=segment_ids,\n",
+    "            use_one_hot_embeddings=use_one_hot_embeddings)\n",
+    "\n",
+    "        (masked_lm_loss,\n",
+    "         masked_lm_example_loss, masked_lm_log_probs) = rp.get_masked_lm_output(\n",
+    "            bert_config, model.get_sequence_output(), model.get_embedding_table(),\n",
+    "            masked_lm_positions, masked_lm_ids, masked_lm_weights)\n",
+    "\n",
+    "        (next_sentence_loss, next_sentence_example_loss,\n",
+    "         next_sentence_log_probs) = rp.get_next_sentence_output(\n",
+    "            bert_config, model.get_pooled_output(), next_sentence_labels)\n",
+    "\n",
+    "        total_loss = masked_lm_loss + next_sentence_loss\n",
+    "\n",
+    "        tvars = tf.trainable_variables()\n",
+    "\n",
+    "        initialized_variable_names = {}\n",
+    "        scaffold_fn = None\n",
+    "        if init_checkpoint:\n",
+    "            (assignment_map,\n",
+    "             initialized_variable_names) = tfm.get_assigment_map_from_checkpoint(\n",
+    "                tvars, init_checkpoint)\n",
+    "            if use_tpu:\n",
+    "\n",
+    "                def tpu_scaffold():\n",
+    "                    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
+    "                    return tf.train.Scaffold()\n",
+    "\n",
+    "                scaffold_fn = tpu_scaffold\n",
+    "            else:\n",
+    "                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)\n",
+    "\n",
+    "        tf.logging.info(\"**** Trainable Variables ****\")\n",
+    "        for var in tvars:\n",
+    "            init_string = \"\"\n",
+    "            if var.name in initialized_variable_names:\n",
+    "                init_string = \", *INIT_FROM_CKPT*\"\n",
+    "            tf.logging.info(\"  name = %s, shape = %s%s\", var.name, var.shape,\n",
+    "                            init_string)\n",
+    "\n",
+    "        output_spec = None\n",
+    "        if mode == tf.estimator.ModeKeys.TRAIN:\n",
+    "            # `optimization` is not imported by the notebook's import cell; use rp's.\n",
+    "            # NOTE(review): assumes run_pretraining does `import optimization`.\n",
+    "            train_op = rp.optimization.create_optimizer(\n",
+    "                total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu)\n",
+    "\n",
+    "            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\n",
+    "                mode=mode,\n",
+    "                loss=total_loss,\n",
+    "                train_op=train_op,\n",
+    "                scaffold_fn=scaffold_fn)\n",
+    "        elif mode == tf.estimator.ModeKeys.EVAL:\n",
+    "            # Label and weight tensors were already read from `features` above.\n",
+    "\n",
+    "            def metric_fn(masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,\n",
+    "                          masked_lm_weights, next_sentence_example_loss,\n",
+    "                          next_sentence_log_probs, next_sentence_labels):\n",
+    "                \"\"\"Computes the loss and accuracy of the model.\"\"\"\n",
+    "                masked_lm_log_probs = tf.reshape(masked_lm_log_probs,\n",
+    "                                                 [-1, masked_lm_log_probs.shape[-1]])\n",
+    "                masked_lm_predictions = tf.argmax(\n",
+    "                    masked_lm_log_probs, axis=-1, output_type=tf.int32)\n",
+    "                masked_lm_example_loss = tf.reshape(masked_lm_example_loss, [-1])\n",
+    "                masked_lm_ids = tf.reshape(masked_lm_ids, [-1])\n",
+    "                masked_lm_weights = tf.reshape(masked_lm_weights, [-1])\n",
+    "                masked_lm_accuracy = tf.metrics.accuracy(\n",
+    "                    labels=masked_lm_ids,\n",
+    "                    predictions=masked_lm_predictions,\n",
+    "                    weights=masked_lm_weights)\n",
+    "                masked_lm_mean_loss = tf.metrics.mean(\n",
+    "                    values=masked_lm_example_loss, weights=masked_lm_weights)\n",
+    "\n",
+    "                next_sentence_log_probs = tf.reshape(\n",
+    "                    next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])\n",
+    "                next_sentence_predictions = tf.argmax(\n",
+    "                    next_sentence_log_probs, axis=-1, output_type=tf.int32)\n",
+    "                next_sentence_labels = tf.reshape(next_sentence_labels, [-1])\n",
+    "                next_sentence_accuracy = tf.metrics.accuracy(\n",
+    "                    labels=next_sentence_labels, predictions=next_sentence_predictions)\n",
+    "                next_sentence_mean_loss = tf.metrics.mean(\n",
+    "                    values=next_sentence_example_loss)\n",
+    "\n",
+    "                return {\n",
+    "                    \"masked_lm_accuracy\": masked_lm_accuracy,\n",
+    "                    \"masked_lm_loss\": masked_lm_mean_loss,\n",
+    "                    \"next_sentence_accuracy\": next_sentence_accuracy,\n",
+    "                    \"next_sentence_loss\": next_sentence_mean_loss,\n",
+    "                }\n",
+    "\n",
+    "            eval_metrics = (metric_fn, [\n",
+    "                masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,\n",
+    "                masked_lm_weights, next_sentence_example_loss,\n",
+    "                next_sentence_log_probs, next_sentence_labels\n",
+    "            ])\n",
+    "            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\n",
+    "                mode=mode,\n",
+    "                loss=total_loss,\n",
+    "                eval_metrics=eval_metrics,\n",
+    "                scaffold_fn=scaffold_fn)\n",
+    "        elif mode == tf.estimator.ModeKeys.PREDICT:\n",
+    "            masked_lm_log_probs = tf.reshape(masked_lm_log_probs,\n",
+    "                                                [-1, masked_lm_log_probs.shape[-1]])\n",
+    "            masked_lm_predictions = tf.argmax(\n",
+    "                masked_lm_log_probs, axis=-1, output_type=tf.int32)\n",
+    "\n",
+    "            next_sentence_log_probs = tf.reshape(\n",
+    "                next_sentence_log_probs, [-1, next_sentence_log_probs.shape[-1]])\n",
+    "            next_sentence_predictions = tf.argmax(\n",
+    "                next_sentence_log_probs, axis=-1, output_type=tf.int32)\n",
+    "\n",
+    "            # -1 keeps the batch dimension free instead of fixing it to 1.\n",
+    "            masked_lm_predictions = tf.reshape(masked_lm_predictions,\n",
+    "                                                [-1, masked_lm_positions.shape[-1]])\n",
+    "            next_sentence_predictions = tf.reshape(next_sentence_predictions,\n",
+    "                                                [-1, 1])\n",
+    "\n",
+    "            predictions = {\n",
+    "                \"masked_lm_predictions\": masked_lm_predictions,\n",
+    "                \"next_sentence_predictions\": next_sentence_predictions\n",
+    "            }\n",
+    "\n",
+    "            output_spec = tf.contrib.tpu.TPUEstimatorSpec(\n",
+    "                mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)\n",
+    "        else:\n",
+    "            raise ValueError(\"Only TRAIN, EVAL and PREDICT modes are supported: %s\" % (mode))\n",
+    "\n",
+    "        return output_spec\n",
+    "\n",
+    "    return model_fn"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2018-11-16T10:02:40.328700Z",
+     "start_time": "2018-11-16T10:02:36.289676Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12a864ae8>) includes params argument, but params are not passed to Estimator.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - WARNING - tensorflow -   Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12a864ae8>) includes params argument, but params are not passed to Estimator.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - WARNING - tensorflow -   Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
       "graph_options {\n",
       "  rewrite_options {\n",
       "    meta_optimizer_iterations: ONE\n",
       "  }\n",
       "}\n",
-      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12b3e1c18>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
-      "WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n",
-      "INFO:tensorflow:_TPUContext: eval_on_tpu True\n",
+      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12dbb5ac8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - INFO - tensorflow -   Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
+      "graph_options {\n",
+      "  rewrite_options {\n",
+      "    meta_optimizer_iterations: ONE\n",
+      "  }\n",
+      "}\n",
+      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12dbb5ac8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - WARNING - tensorflow -   Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:_TPUContext: eval_on_tpu True\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - INFO - tensorflow -   _TPUContext: eval_on_tpu True\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
       "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
      ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - WARNING - tensorflow -   eval_on_tpu ignored because use_tpu is False.\n"
+     ]
     }
    ],
    "source": [
@@ -160,7 +720,9 @@
     "model_fn = model_fn_builder(\n",
     "    bert_config=bert_config,\n",
     "    init_checkpoint=init_checkpoint,\n",
-    "    layer_indexes=layer_indexes,\n",
+    "    learning_rate=0,\n",
+    "    num_train_steps=1,\n",
+    "    num_warmup_steps=1,\n",
     "    use_tpu=False,\n",
     "    use_one_hot_embeddings=False)\n",
     "\n",
@@ -173,16 +735,17 @@
     "    predict_batch_size=1)\n",
     "\n",
     "input_fn = input_fn_builder(\n",
-    "    features=features, seq_length=max_seq_length)"
+    "    features=features, seq_length=max_seq_length, max_predictions_per_seq=max_predictions_per_seq,\n",
+    "tokenizer=tokenizer)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 11,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-05T13:59:01.717585Z",
-     "start_time": "2018-11-05T13:58:55.941869Z"
+     "end_time": "2018-11-16T10:02:46.596956Z",
+     "start_time": "2018-11-16T10:02:40.331008Z"
     }
    },
    "outputs": [
@@ -190,62 +753,3152 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpdbx_h23u, running initialization to predict.\n",
-      "INFO:tensorflow:Calling model_fn.\n",
-      "INFO:tensorflow:Running infer on CPU\n",
-      "INFO:tensorflow:Done calling model_fn.\n",
-      "INFO:tensorflow:Graph was finalized.\n",
-      "INFO:tensorflow:Running local_init_op.\n",
-      "INFO:tensorflow:Done running local_init_op.\n",
-      "extracting layer 0\n",
-      "extracting layer 1\n",
-      "extracting layer 2\n",
-      "extracting layer 3\n",
-      "extracting layer 4\n",
-      "extracting layer 5\n",
-      "extracting layer 6\n",
-      "extracting layer 7\n",
-      "extracting layer 8\n",
-      "extracting layer 9\n",
-      "extracting layer 10\n",
-      "extracting layer 11\n",
-      "INFO:tensorflow:prediction_loop marked as finished\n",
+      "INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d, running initialization to predict.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - INFO - tensorflow -   Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmp4x8r3x3d, running initialization to predict.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Calling model_fn.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - INFO - tensorflow -   Calling model_fn.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Running infer on CPU\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - INFO - tensorflow -   Running infer on CPU\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:*** Features ***\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - INFO - tensorflow -   *** Features ***\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = input_ids, shape = (?, 128)\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - INFO - tensorflow -     name = input_ids, shape = (?, 128)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = input_mask, shape = (?, 128)\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - INFO - tensorflow -     name = input_mask, shape = (?, 128)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = masked_lm_ids, shape = (?, 20)\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - INFO - tensorflow -     name = masked_lm_ids, shape = (?, 20)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = masked_lm_positions, shape = (?, 20)\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - INFO - tensorflow -     name = masked_lm_positions, shape = (?, 20)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = masked_lm_weights, shape = (?, 20)\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - INFO - tensorflow -     name = masked_lm_weights, shape = (?, 20)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = next_sentence_labels, shape = (?, 1)\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - INFO - tensorflow -     name = next_sentence_labels, shape = (?, 1)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = segment_ids, shape = (?, 128)\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:40 - INFO - tensorflow -     name = segment_ids, shape = (?, 128)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:**** Trainable Variables ****\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -   **** Trainable Variables ****\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/word_embeddings:0, shape = (30522, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/token_type_embeddings:0, shape = (2, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/position_embeddings:0, shape = (512, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/embeddings/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_0/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_0/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_1/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_2/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_2/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_3/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_4/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_5/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_5/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_6/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_7/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_7/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_8/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_9/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_10/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/query/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/query/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/key/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/key/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/value/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/self/value/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/attention/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/intermediate/dense/kernel:0, shape = (768, 3072), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/intermediate/dense/bias:0, shape = (3072,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/dense/kernel:0, shape = (3072, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/encoder/layer_11/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/encoder/layer_11/output/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/pooler/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/pooler/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = bert/pooler/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = bert/pooler/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = cls/predictions/transform/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/dense/kernel:0, shape = (768, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = cls/predictions/transform/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/dense/bias:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = cls/predictions/transform/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/LayerNorm/beta:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = cls/predictions/transform/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/transform/LayerNorm/gamma:0, shape = (768,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = cls/predictions/output_bias:0, shape = (30522,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/predictions/output_bias:0, shape = (30522,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = cls/seq_relationship/output_weights:0, shape = (2, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/seq_relationship/output_weights:0, shape = (2, 768), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:  name = cls/seq_relationship/output_bias:0, shape = (2,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -     name = cls/seq_relationship/output_bias:0, shape = (2,), *INIT_FROM_CKPT*\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Done calling model_fn.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:43 - INFO - tensorflow -   Done calling model_fn.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Graph was finalized.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:44 - INFO - tensorflow -   Graph was finalized.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Running local_init_op.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:45 - INFO - tensorflow -   Running local_init_op.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:Done running local_init_op.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:45 - INFO - tensorflow -   Done running local_init_op.\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
       "INFO:tensorflow:prediction_loop marked as finished\n"
      ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:46 - INFO - tensorflow -   prediction_loop marked as finished\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "INFO:tensorflow:prediction_loop marked as finished\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:02:46 - INFO - tensorflow -   prediction_loop marked as finished\n"
+     ]
     }
    ],
    "source": [
     "tensorflow_all_out = []\n",
     "for result in estimator.predict(input_fn, yield_single_examples=True):\n",
-    "    unique_id = int(result[\"unique_id\"])\n",
-    "    feature = unique_id_to_feature[unique_id]\n",
-    "    output_json = collections.OrderedDict()\n",
-    "    output_json[\"linex_index\"] = unique_id\n",
-    "    tensorflow_all_out_features = []\n",
-    "    # for (i, token) in enumerate(feature.tokens):\n",
-    "    all_layers = []\n",
-    "    for (j, layer_index) in enumerate(layer_indexes):\n",
-    "        print(\"extracting layer {}\".format(j))\n",
-    "        layer_output = result[\"layer_output_%d\" % j]\n",
-    "        layers = collections.OrderedDict()\n",
-    "        layers[\"index\"] = layer_index\n",
-    "        layers[\"values\"] = layer_output\n",
-    "        all_layers.append(layers)\n",
-    "    tensorflow_out_features = collections.OrderedDict()\n",
-    "    tensorflow_out_features[\"layers\"] = all_layers\n",
-    "    tensorflow_all_out_features.append(tensorflow_out_features)\n",
-    "\n",
-    "    output_json[\"features\"] = tensorflow_all_out_features\n",
-    "    tensorflow_all_out.append(output_json)"
+    "    tensorflow_all_out.append(result)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 12,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-05T13:59:01.769845Z",
-     "start_time": "2018-11-05T13:59:01.719878Z"
+     "end_time": "2018-11-16T10:02:46.634304Z",
+     "start_time": "2018-11-16T10:02:46.598800Z"
     }
    },
    "outputs": [
@@ -255,43 +3908,42 @@
      "text": [
       "1\n",
       "2\n",
-      "odict_keys(['linex_index', 'features'])\n",
-      "number of tokens 1\n",
-      "number of layers 12\n"
+      "dict_keys(['masked_lm_predictions', 'next_sentence_predictions'])\n",
+      "masked_lm_predictions [27227  1010  1010  1010  1010  1010  1010  1010  1010  1010  1010  1010\n",
+      "  1010  1010  1010  1010  1010  1010  1010  1010]\n",
+      "predicted token ['henson', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',', ',']\n"
      ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "(128, 768)"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
     }
    ],
    "source": [
     "print(len(tensorflow_all_out))\n",
     "print(len(tensorflow_all_out[0]))\n",
     "print(tensorflow_all_out[0].keys())\n",
-    "print(\"number of tokens\", len(tensorflow_all_out[0]['features']))\n",
-    "print(\"number of layers\", len(tensorflow_all_out[0]['features'][0]['layers']))\n",
-    "tensorflow_all_out[0]['features'][0]['layers'][0]['values'].shape"
+    "print(\"masked_lm_predictions\", tensorflow_all_out[0]['masked_lm_predictions'])\n",
+    "print(\"predicted token\", tokenizer.convert_ids_to_tokens(tensorflow_all_out[0]['masked_lm_predictions']))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 13,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-05T13:59:01.807638Z",
-     "start_time": "2018-11-05T13:59:01.771422Z"
+     "end_time": "2018-11-16T10:02:46.671229Z",
+     "start_time": "2018-11-16T10:02:46.637102Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "tensorflow_output: ['henson']\n"
+     ]
+    }
+   ],
    "source": [
-    "tensorflow_outputs = list(tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)"
+    "tensorflow_outputs = tokenizer.convert_ids_to_tokens(tensorflow_all_out[0]['masked_lm_predictions'])[:len(masked_lm_positions)]\n",
+    "print(\"tensorflow_output:\", tensorflow_outputs)"
    ]
   },
   {
@@ -303,26 +3955,26 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 14,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-05T13:59:02.020918Z",
-     "start_time": "2018-11-05T13:59:01.810061Z"
+     "end_time": "2018-11-16T10:03:03.556557Z",
+     "start_time": "2018-11-16T10:03:03.519654Z"
     }
    },
    "outputs": [],
    "source": [
-    "import extract_features\n",
-    "from extract_features import *"
+    "from examples import extract_features\n",
+    "from examples.extract_features import *"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 15,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-05T13:59:02.058211Z",
-     "start_time": "2018-11-05T13:59:02.022785Z"
+     "end_time": "2018-11-16T10:03:03.952710Z",
+     "start_time": "2018-11-16T10:03:03.921917Z"
     }
    },
    "outputs": [],
@@ -332,332 +3984,365 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 16,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-05T13:59:03.740561Z",
-     "start_time": "2018-11-05T13:59:02.059877Z"
+     "end_time": "2018-11-16T10:03:12.307673Z",
+     "start_time": "2018-11-16T10:03:04.439317Z"
     },
     "scrolled": true
    },
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba\n",
+      "11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpaqgsm566\n",
+      "11/16/2018 11:03:08 - INFO - pytorch_pretrained_bert.modeling -   Model config {\n",
+      "  \"attention_probs_dropout_prob\": 0.1,\n",
+      "  \"hidden_act\": \"gelu\",\n",
+      "  \"hidden_dropout_prob\": 0.1,\n",
+      "  \"hidden_size\": 768,\n",
+      "  \"initializer_range\": 0.02,\n",
+      "  \"intermediate_size\": 3072,\n",
+      "  \"max_position_embeddings\": 512,\n",
+      "  \"num_attention_heads\": 12,\n",
+      "  \"num_hidden_layers\": 12,\n",
+      "  \"type_vocab_size\": 2,\n",
+      "  \"vocab_size\": 30522\n",
+      "}\n",
+      "\n"
+     ]
+    },
     {
      "data": {
       "text/plain": [
-       "BertModel(\n",
-       "  (embeddings): BERTEmbeddings(\n",
-       "    (word_embeddings): Embedding(30522, 768)\n",
-       "    (position_embeddings): Embedding(512, 768)\n",
-       "    (token_type_embeddings): Embedding(2, 768)\n",
-       "    (LayerNorm): BERTLayerNorm()\n",
-       "    (dropout): Dropout(p=0.1)\n",
-       "  )\n",
-       "  (encoder): BERTEncoder(\n",
-       "    (layer): ModuleList(\n",
-       "      (0): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "BertForPreTraining(\n",
+       "  (bert): BertModel(\n",
+       "    (embeddings): BertEmbeddings(\n",
+       "      (word_embeddings): Embedding(30522, 768)\n",
+       "      (position_embeddings): Embedding(512, 768)\n",
+       "      (token_type_embeddings): Embedding(2, 768)\n",
+       "      (LayerNorm): BertLayerNorm()\n",
+       "      (dropout): Dropout(p=0.1)\n",
+       "    )\n",
+       "    (encoder): BertEncoder(\n",
+       "      (layer): ModuleList(\n",
+       "        (0): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (1): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (1): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (2): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (2): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (3): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (3): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (4): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (4): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (5): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (5): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (6): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (6): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (7): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (7): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (8): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (8): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (9): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (9): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (10): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (10): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (11): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "        (11): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
        "        )\n",
        "      )\n",
        "    )\n",
+       "    (pooler): BertPooler(\n",
+       "      (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "      (activation): Tanh()\n",
+       "    )\n",
        "  )\n",
-       "  (pooler): BERTPooler(\n",
-       "    (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "    (activation): Tanh()\n",
+       "  (cls): BertPreTrainingHeads(\n",
+       "    (predictions): BertLMPredictionHead(\n",
+       "      (transform): BertPredictionHeadTransform(\n",
+       "        (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "        (LayerNorm): BertLayerNorm()\n",
+       "      )\n",
+       "      (decoder): Linear(in_features=768, out_features=30522, bias=False)\n",
+       "    )\n",
+       "    (seq_relationship): Linear(in_features=768, out_features=2, bias=True)\n",
        "  )\n",
        ")"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
     "device = torch.device(\"cpu\")\n",
-    "model = extract_features.BertModel(bert_config)\n",
-    "model.load_state_dict(torch.load(init_checkpoint_pt, map_location='cpu'))\n",
+    "model = ppb.BertForPreTraining.from_pretrained('bert-base-uncased')\n",
     "model.to(device)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 17,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-05T13:59:03.780145Z",
-     "start_time": "2018-11-05T13:59:03.742407Z"
+     "end_time": "2018-11-16T10:03:12.351625Z",
+     "start_time": "2018-11-16T10:03:12.310736Z"
     },
     "code_folding": []
    },
@@ -665,302 +4350,314 @@
     {
      "data": {
       "text/plain": [
-       "BertModel(\n",
-       "  (embeddings): BERTEmbeddings(\n",
-       "    (word_embeddings): Embedding(30522, 768)\n",
-       "    (position_embeddings): Embedding(512, 768)\n",
-       "    (token_type_embeddings): Embedding(2, 768)\n",
-       "    (LayerNorm): BERTLayerNorm()\n",
-       "    (dropout): Dropout(p=0.1)\n",
-       "  )\n",
-       "  (encoder): BERTEncoder(\n",
-       "    (layer): ModuleList(\n",
-       "      (0): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "BertForPreTraining(\n",
+       "  (bert): BertModel(\n",
+       "    (embeddings): BertEmbeddings(\n",
+       "      (word_embeddings): Embedding(30522, 768)\n",
+       "      (position_embeddings): Embedding(512, 768)\n",
+       "      (token_type_embeddings): Embedding(2, 768)\n",
+       "      (LayerNorm): BertLayerNorm()\n",
+       "      (dropout): Dropout(p=0.1)\n",
+       "    )\n",
+       "    (encoder): BertEncoder(\n",
+       "      (layer): ModuleList(\n",
+       "        (0): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (1): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (1): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (2): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (2): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (3): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (3): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (4): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (4): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (5): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (5): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (6): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (6): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (7): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (7): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (8): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (8): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (9): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (9): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (10): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (dropout): Dropout(p=0.1)\n",
+       "        (10): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
        "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
-       "        )\n",
-       "      )\n",
-       "      (11): BERTLayer(\n",
-       "        (attention): BERTAttention(\n",
-       "          (self): BERTSelfAttention(\n",
-       "            (query): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (key): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "        (11): BertLayer(\n",
+       "          (attention): BertAttention(\n",
+       "            (self): BertSelfAttention(\n",
+       "              (query): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (key): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (value): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "            (output): BertSelfOutput(\n",
+       "              (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "              (LayerNorm): BertLayerNorm()\n",
+       "              (dropout): Dropout(p=0.1)\n",
+       "            )\n",
+       "          )\n",
+       "          (intermediate): BertIntermediate(\n",
+       "            (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
+       "          )\n",
+       "          (output): BertOutput(\n",
+       "            (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
+       "            (LayerNorm): BertLayerNorm()\n",
        "            (dropout): Dropout(p=0.1)\n",
        "          )\n",
-       "          (output): BERTSelfOutput(\n",
-       "            (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "            (LayerNorm): BERTLayerNorm()\n",
-       "            (dropout): Dropout(p=0.1)\n",
-       "          )\n",
-       "        )\n",
-       "        (intermediate): BERTIntermediate(\n",
-       "          (dense): Linear(in_features=768, out_features=3072, bias=True)\n",
-       "        )\n",
-       "        (output): BERTOutput(\n",
-       "          (dense): Linear(in_features=3072, out_features=768, bias=True)\n",
-       "          (LayerNorm): BERTLayerNorm()\n",
-       "          (dropout): Dropout(p=0.1)\n",
        "        )\n",
        "      )\n",
        "    )\n",
+       "    (pooler): BertPooler(\n",
+       "      (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "      (activation): Tanh()\n",
+       "    )\n",
        "  )\n",
-       "  (pooler): BERTPooler(\n",
-       "    (dense): Linear(in_features=768, out_features=768, bias=True)\n",
-       "    (activation): Tanh()\n",
+       "  (cls): BertPreTrainingHeads(\n",
+       "    (predictions): BertLMPredictionHead(\n",
+       "      (transform): BertPredictionHeadTransform(\n",
+       "        (dense): Linear(in_features=768, out_features=768, bias=True)\n",
+       "        (LayerNorm): BertLayerNorm()\n",
+       "      )\n",
+       "      (decoder): Linear(in_features=768, out_features=30522, bias=False)\n",
+       "    )\n",
+       "    (seq_relationship): Linear(in_features=768, out_features=2, bias=True)\n",
        "  )\n",
        ")"
       ]
      },
-     "execution_count": 12,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -968,10 +4665,10 @@
    "source": [
     "all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n",
     "all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n",
-    "all_input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)\n",
-    "all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n",
+    "all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)\n",
+    "all_masked_lm_positions = torch.tensor([f.masked_lm_positions for f in features], dtype=torch.long)\n",
     "\n",
-    "eval_data = TensorDataset(all_input_ids, all_input_mask, all_input_type_ids, all_example_index)\n",
+    "eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_masked_lm_positions)\n",
     "eval_sampler = SequentialSampler(eval_data)\n",
     "eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\n",
     "\n",
@@ -980,11 +4677,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 18,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-05T13:59:04.233844Z",
-     "start_time": "2018-11-05T13:59:03.782525Z"
+     "end_time": "2018-11-16T10:03:12.792741Z",
+     "start_time": "2018-11-16T10:03:12.354253Z"
     }
    },
    "outputs": [
@@ -992,8 +4689,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "tensor([[  101,  2040,  2001,  3958, 27227,  1029,   102,  3958, 27227,  2001,\n",
-      "          1037, 13997, 11510,   102,     0,     0,     0,     0,     0,     0,\n",
+      "tensor([[ 2040,  2001,  3958, 27227,  1029,  3958,   103,  2001,  1037, 13997,\n",
+      "         11510,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
       "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
       "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
       "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
@@ -1005,74 +4702,49 @@
       "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
       "             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
       "             0,     0,     0,     0,     0,     0,     0,     0]])\n",
-      "tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+      "tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
       "         0, 0, 0, 0, 0, 0, 0, 0]])\n",
-      "tensor([0])\n",
-      "layer 0 0\n",
-      "layer 1 1\n",
-      "layer 2 2\n",
-      "layer 3 3\n",
-      "layer 4 4\n",
-      "layer 5 5\n",
-      "layer 6 6\n",
-      "layer 7 7\n",
-      "layer 8 8\n",
-      "layer 9 9\n",
-      "layer 10 10\n",
-      "layer 11 11\n"
+      "tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+      "         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
+      "         0, 0, 0, 0, 0, 0, 0, 0]])\n",
+      "(1, 20, 30522)\n",
+      "[27227, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010, 1010]\n"
      ]
     }
    ],
    "source": [
-    "layer_indexes = list(range(12))\n",
-    "\n",
+    "import numpy as np\n",
     "pytorch_all_out = []\n",
-    "for input_ids, input_mask, input_type_ids, example_indices in eval_dataloader:\n",
+    "for input_ids, input_mask, segment_ids, tensor_masked_lm_positions in eval_dataloader:\n",
     "    print(input_ids)\n",
     "    print(input_mask)\n",
-    "    print(example_indices)\n",
+    "    print(segment_ids)\n",
     "    input_ids = input_ids.to(device)\n",
     "    input_mask = input_mask.to(device)\n",
+    "    segment_ids = segment_ids.to(device)\n",
     "\n",
-    "    all_encoder_layers, _ = model(input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)\n",
-    "\n",
-    "    for b, example_index in enumerate(example_indices):\n",
-    "        feature = features[example_index.item()]\n",
-    "        unique_id = int(feature.unique_id)\n",
-    "        # feature = unique_id_to_feature[unique_id]\n",
-    "        output_json = collections.OrderedDict()\n",
-    "        output_json[\"linex_index\"] = unique_id\n",
-    "        all_out_features = []\n",
-    "        # for (i, token) in enumerate(feature.tokens):\n",
-    "        all_layers = []\n",
-    "        for (j, layer_index) in enumerate(layer_indexes):\n",
-    "            print(\"layer\", j, layer_index)\n",
-    "            layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()\n",
-    "            layer_output = layer_output[b]\n",
-    "            layers = collections.OrderedDict()\n",
-    "            layers[\"index\"] = layer_index\n",
-    "            layer_output = layer_output\n",
-    "            layers[\"values\"] = layer_output if not isinstance(layer_output, (int, float)) else [layer_output]\n",
-    "            all_layers.append(layers)\n",
-    "\n",
-    "            out_features = collections.OrderedDict()\n",
-    "            out_features[\"layers\"] = all_layers\n",
-    "            all_out_features.append(out_features)\n",
-    "        output_json[\"features\"] = all_out_features\n",
-    "        pytorch_all_out.append(output_json)"
+    "    prediction_scores, _ = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)\n",
+    "    prediction_scores = prediction_scores[0, tensor_masked_lm_positions].detach().cpu().numpy()\n",
+    "    print(prediction_scores.shape)\n",
+    "    masked_lm_predictions = np.argmax(prediction_scores, axis=-1).squeeze().tolist()\n",
+    "    print(masked_lm_predictions)\n",
+    "    pytorch_all_out.append(masked_lm_predictions)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 19,
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2018-11-05T13:59:04.278496Z",
-     "start_time": "2018-11-05T13:59:04.235703Z"
+     "end_time": "2018-11-16T10:03:12.828439Z",
+     "start_time": "2018-11-16T10:03:12.795420Z"
     }
    },
    "outputs": [
@@ -1080,140 +4752,15 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "1\n",
-      "2\n",
-      "odict_keys(['linex_index', 'features'])\n",
-      "number of tokens 1\n",
-      "number of layers 12\n",
-      "hidden_size 128\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "(128, 768)"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "print(len(pytorch_all_out))\n",
-    "print(len(pytorch_all_out[0]))\n",
-    "print(pytorch_all_out[0].keys())\n",
-    "print(\"number of tokens\", len(pytorch_all_out))\n",
-    "print(\"number of layers\", len(pytorch_all_out[0]['features'][0]['layers']))\n",
-    "print(\"hidden_size\", len(pytorch_all_out[0]['features'][0]['layers'][0]['values']))\n",
-    "pytorch_all_out[0]['features'][0]['layers'][0]['values'].shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-05T13:59:04.313952Z",
-     "start_time": "2018-11-05T13:59:04.280352Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(128, 768)\n",
-      "(128, 768)\n"
+      "pytorch_output: ['henson']\n",
+      "tensorflow_output: ['henson']\n"
      ]
     }
    ],
    "source": [
-    "pytorch_outputs = list(pytorch_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)\n",
-    "print(pytorch_outputs[0].shape)\n",
-    "print(pytorch_outputs[1].shape)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-05T13:59:04.350048Z",
-     "start_time": "2018-11-05T13:59:04.316003Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "(128, 768)\n",
-      "(128, 768)\n"
-     ]
-    }
-   ],
-   "source": [
-    "print(tensorflow_outputs[0].shape)\n",
-    "print(tensorflow_outputs[1].shape)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## 3/ Comparing the standard deviation on the last layer of both models"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-05T13:59:04.382430Z",
-     "start_time": "2018-11-05T13:59:04.351550Z"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {
-    "ExecuteTime": {
-     "end_time": "2018-11-05T13:59:04.428334Z",
-     "start_time": "2018-11-05T13:59:04.386070Z"
-    }
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "shape tensorflow layer, shape pytorch layer, standard deviation\n",
-      "((128, 768), (128, 768), 1.5258875e-07)\n",
-      "((128, 768), (128, 768), 2.342731e-07)\n",
-      "((128, 768), (128, 768), 2.801949e-07)\n",
-      "((128, 768), (128, 768), 3.5904986e-07)\n",
-      "((128, 768), (128, 768), 4.2842768e-07)\n",
-      "((128, 768), (128, 768), 5.127951e-07)\n",
-      "((128, 768), (128, 768), 6.14668e-07)\n",
-      "((128, 768), (128, 768), 7.063922e-07)\n",
-      "((128, 768), (128, 768), 7.906173e-07)\n",
-      "((128, 768), (128, 768), 8.475192e-07)\n",
-      "((128, 768), (128, 768), 8.975489e-07)\n",
-      "((128, 768), (128, 768), 4.1671223e-07)\n"
-     ]
-    }
-   ],
-   "source": [
-    "print('shape tensorflow layer, shape pytorch layer, standard deviation')\n",
-    "print('\\n'.join(list(str((np.array(tensorflow_outputs[i]).shape,\n",
-    "                          np.array(pytorch_outputs[i]).shape, \n",
-    "                          np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0)))) for i in range(12))))"
+    "pytorch_outputs = tokenizer.convert_ids_to_tokens(pytorch_all_out[0])[:len(masked_lm_positions)]\n",
+    "print(\"pytorch_output:\", pytorch_outputs)\n",
+    "print(\"tensorflow_output:\", tensorflow_outputs)"
    ]
   },
   {