From 8c932e37f9bb86efe6c08c7ef4f280a14455794f Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Sat, 3 Nov 2018 09:08:05 -0400
Subject: [PATCH] Update the comparison notebook

---
 Comparing TF and PT models.ipynb | 141 ++++++++++++++++++++-----------
 1 file changed, 94 insertions(+), 47 deletions(-)

diff --git a/Comparing TF and PT models.ipynb b/Comparing TF and PT models.ipynb
index 2f18d3b13c..912113bb4e 100644
--- a/Comparing TF and PT models.ipynb	
+++ b/Comparing TF and PT models.ipynb	
@@ -4,26 +4,72 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# TensorFlow code"
+    "# Comparing TensorFlow (original) and PyTorch models\n",
+    "\n",
+    "We use this small notebook to test the conversion of the model's weights and to make sure that the TensorFlow and PyTorch models are consistent with each other. In particular, we compare the outputs of the last layer of both models on a simple example (in `input.txt`).\n",
+    "\n",
+    "To run this notebook, please make sure that your Python environment has both TensorFlow and PyTorch.\n",
+    "You should follow the instructions in the `README.md` and make sure that you have:\n",
+    "- the original TensorFlow implementation\n",
+    "- the `BERT-base, Uncased` model\n",
+    "- run the script `convert_tf_checkpoint_to_pytorch.py` to convert the weights to PyTorch\n",
+    "\n",
+    "Please modify the relative paths accordingly (at the beginning of Sections 1 and 2)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1/ TensorFlow code"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "original_tf_inplem_dir = \"../bert/\"\n",
+    "model_dir = \"../uncased_L-12_H-768_A-12/\"\n",
+    "\n",
+    "vocab_file = model_dir + \"vocab.txt\"\n",
+    "bert_config_file = model_dir + \"bert_config.json\"\n",
+    "init_checkpoint = model_dir + \"bert_model.ckpt\"\n",
+    "\n",
+    "input_file = \"input.txt\"\n",
+    "max_seq_length = 128"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:09:37.498678Z",
      "start_time": "2018-11-03T02:09:36.366672Z"
     }
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/usr/local/lib/python3.6/site-packages/h5py/__init__.py:34: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
+      "  from ._conv import register_converters as _register_converters\n"
+     ]
+    }
+   ],
    "source": [
+    "import sys\n",
+    "sys.path.append(original_tf_inplem_dir)\n",
+    "\n",
     "from extract_features import *"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:09:37.621865Z",
@@ -45,13 +91,6 @@
     }
    ],
    "source": [
-    "data_dir=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/data/glue_data/MRPC/\"\n",
-    "vocab_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/vocab.txt\"\n",
-    "bert_config_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/bert_config.json\"\n",
-    "init_checkpoint=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/bert_model.ckpt\"\n",
-    "max_seq_length=128\n",
-    "input_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/pytorch-pretrained-BERT/input.txt\"\n",
-    "\n",
     "layer_indexes = list(range(12))\n",
     "bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
     "tokenizer = tokenization.FullTokenizer(\n",
@@ -67,7 +106,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:09:40.831618Z",
@@ -79,15 +118,15 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12b0bcc80>) includes params argument, but params are not passed to Estimator.\n",
-      "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u\n",
-      "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
+      "WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x1289c1a60>) includes params argument, but params are not passed to Estimator.\n",
+      "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/y2/py87pn6115bdsdftbc6394nh0000gn/T/tmpmcfk2tyr\n",
+      "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/y2/py87pn6115bdsdftbc6394nh0000gn/T/tmpmcfk2tyr', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
       "graph_options {\n",
       "  rewrite_options {\n",
       "    meta_optimizer_iterations: ONE\n",
       "  }\n",
       "}\n",
-      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12e1160f0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
+      ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12c242470>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
       "WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n",
       "INFO:tensorflow:_TPUContext: eval_on_tpu True\n",
       "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
@@ -123,7 +162,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:09:46.413197Z",
@@ -135,7 +174,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u, running initialization to predict.\n",
+      "INFO:tensorflow:Could not find trained model in model_dir: /var/folders/y2/py87pn6115bdsdftbc6394nh0000gn/T/tmpmcfk2tyr, running initialization to predict.\n",
       "INFO:tensorflow:Calling model_fn.\n",
       "INFO:tensorflow:Running infer on CPU\n",
       "INFO:tensorflow:Done calling model_fn.\n",
@@ -186,7 +225,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:09:46.460128Z",
@@ -211,7 +250,7 @@
        "(128, 768)"
       ]
      },
-     "execution_count": 5,
+     "execution_count": 6,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -227,7 +266,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:09:46.498637Z",
@@ -243,12 +282,12 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# PyTorch code"
+    "## 2/ PyTorch code"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:09:46.660303Z",
@@ -263,12 +302,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "init_checkpoint_pt = \"../pytorch_model/uncased_L-12_H-768_A-12/pytorch_model.bin\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:09:48.292135Z",
      "start_time": "2018-11-03T02:09:46.661921Z"
-    }
+    },
+    "scrolled": true
    },
    "outputs": [
     {
@@ -569,14 +618,12 @@
        ")"
       ]
      },
-     "execution_count": 8,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "init_checkpoint_pt=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/pytorch_model.bin\"\n",
-    "\n",
     "device = torch.device(\"cpu\")\n",
     "model = extract_features_pytorch.BertModel(bert_config)\n",
     "model.load_state_dict(torch.load(init_checkpoint_pt, map_location='cpu'))\n",
@@ -585,7 +632,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 11,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:09:48.332982Z",
@@ -892,7 +939,7 @@
        ")"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -912,7 +959,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 12,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:09:54.371188Z",
@@ -1000,7 +1047,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 13,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:09:57.139854Z",
@@ -1026,7 +1073,7 @@
        "(128, 768)"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1043,7 +1090,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 14,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:09:59.000058Z",
@@ -1068,7 +1115,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 15,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:09:59.462123Z",
@@ -1090,9 +1137,16 @@
     "print(tensorflow_outputs[1].shape)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3/ Comparing the outputs of both models on the last layer (root-mean-square difference)"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 16,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:10:00.014784Z",
@@ -1106,7 +1160,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 17,
    "metadata": {
     "ExecuteTime": {
      "end_time": "2018-11-03T02:10:09.582557Z",
@@ -1127,7 +1181,7 @@
        "4.1671223e-07"
       ]
      },
-     "execution_count": 24,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1137,21 +1191,14 @@
     "print(np.array(tensorflow_outputs[i]).shape, np.array(pytorch_outputs[i]).shape)\n",
     "np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0))"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
   "hide_input": false,
   "kernelspec": {
-   "display_name": "Python [conda env:bert]",
+   "display_name": "Python 3",
    "language": "python",
-   "name": "conda-env-bert-py"
+   "name": "python3"
   },
   "language_info": {
    "codemirror_mode": {
@@ -1163,7 +1210,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.7"
+   "version": "3.6.5"
   },
   "toc": {
    "colors": {