fixing model
This commit is contained in:
@@ -12,8 +12,8 @@
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-02T14:09:09.239405Z",
|
||||
"start_time": "2018-11-02T14:09:08.126668Z"
|
||||
"end_time": "2018-11-03T02:09:37.498678Z",
|
||||
"start_time": "2018-11-03T02:09:36.366672Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
@@ -26,8 +26,8 @@
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-02T14:09:09.370511Z",
|
||||
"start_time": "2018-11-02T14:09:09.242527Z"
|
||||
"end_time": "2018-11-03T02:09:37.621865Z",
|
||||
"start_time": "2018-11-03T02:09:37.500988Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
@@ -52,7 +52,7 @@
|
||||
"max_seq_length=128\n",
|
||||
"input_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/pytorch-pretrained-BERT/input.txt\"\n",
|
||||
"\n",
|
||||
"layer_indexes = [-1]\n",
|
||||
"layer_indexes = list(range(12))\n",
|
||||
"bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n",
|
||||
"tokenizer = tokenization.FullTokenizer(\n",
|
||||
" vocab_file=vocab_file, do_lower_case=True)\n",
|
||||
@@ -70,8 +70,8 @@
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-02T14:09:12.514617Z",
|
||||
"start_time": "2018-11-02T14:09:09.372137Z"
|
||||
"end_time": "2018-11-03T02:09:40.831618Z",
|
||||
"start_time": "2018-11-03T02:09:37.624063Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
@@ -79,15 +79,15 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12b266ae8>) includes params argument, but params are not passed to Estimator.\n",
|
||||
"WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphrjfnoqh\n",
|
||||
"INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphrjfnoqh', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
|
||||
"WARNING:tensorflow:Estimator's model_fn (<function model_fn_builder.<locals>.model_fn at 0x12b0bcc80>) includes params argument, but params are not passed to Estimator.\n",
|
||||
"WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u\n",
|
||||
"INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n",
|
||||
"graph_options {\n",
|
||||
" rewrite_options {\n",
|
||||
" meta_optimizer_iterations: ONE\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12e2c1160>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
|
||||
", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x12e1160f0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n",
|
||||
"WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n",
|
||||
"INFO:tensorflow:_TPUContext: eval_on_tpu True\n",
|
||||
"WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n"
|
||||
@@ -126,8 +126,8 @@
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-02T14:09:17.745970Z",
|
||||
"start_time": "2018-11-02T14:09:12.516953Z"
|
||||
"end_time": "2018-11-03T02:09:46.413197Z",
|
||||
"start_time": "2018-11-03T02:09:40.834621Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
@@ -135,42 +135,53 @@
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmphrjfnoqh, running initialization to predict.\n",
|
||||
"INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpgpb5nz3u, running initialization to predict.\n",
|
||||
"INFO:tensorflow:Calling model_fn.\n",
|
||||
"INFO:tensorflow:Running infer on CPU\n",
|
||||
"INFO:tensorflow:Done calling model_fn.\n",
|
||||
"INFO:tensorflow:Graph was finalized.\n",
|
||||
"INFO:tensorflow:Running local_init_op.\n",
|
||||
"INFO:tensorflow:Done running local_init_op.\n",
|
||||
"extracting layer 0\n",
|
||||
"extracting layer 1\n",
|
||||
"extracting layer 2\n",
|
||||
"extracting layer 3\n",
|
||||
"extracting layer 4\n",
|
||||
"extracting layer 5\n",
|
||||
"extracting layer 6\n",
|
||||
"extracting layer 7\n",
|
||||
"extracting layer 8\n",
|
||||
"extracting layer 9\n",
|
||||
"extracting layer 10\n",
|
||||
"extracting layer 11\n",
|
||||
"INFO:tensorflow:prediction_loop marked as finished\n",
|
||||
"INFO:tensorflow:prediction_loop marked as finished\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"all_out = []\n",
|
||||
"tensorflow_all_out = []\n",
|
||||
"for result in estimator.predict(input_fn, yield_single_examples=True):\n",
|
||||
" unique_id = int(result[\"unique_id\"])\n",
|
||||
" feature = unique_id_to_feature[unique_id]\n",
|
||||
" output_json = collections.OrderedDict()\n",
|
||||
" output_json[\"linex_index\"] = unique_id\n",
|
||||
" all_out_features = []\n",
|
||||
" for (i, token) in enumerate(feature.tokens):\n",
|
||||
" all_layers = []\n",
|
||||
" for (j, layer_index) in enumerate(layer_indexes):\n",
|
||||
" layer_output = result[\"layer_output_%d\" % j]\n",
|
||||
" layers = collections.OrderedDict()\n",
|
||||
" layers[\"index\"] = layer_index\n",
|
||||
" layers[\"values\"] = [\n",
|
||||
" round(float(x), 6) for x in layer_output[i:(i + 1)].flat\n",
|
||||
" ]\n",
|
||||
" all_layers.append(layers)\n",
|
||||
" out_features = collections.OrderedDict()\n",
|
||||
" out_features[\"token\"] = token\n",
|
||||
" out_features[\"layers\"] = all_layers\n",
|
||||
" all_out_features.append(out_features)\n",
|
||||
" output_json[\"features\"] = all_out_features\n",
|
||||
" all_out.append(output_json)"
|
||||
" tensorflow_all_out_features = []\n",
|
||||
" # for (i, token) in enumerate(feature.tokens):\n",
|
||||
" all_layers = []\n",
|
||||
" for (j, layer_index) in enumerate(layer_indexes):\n",
|
||||
" print(\"extracting layer {}\".format(j))\n",
|
||||
" layer_output = result[\"layer_output_%d\" % j]\n",
|
||||
" layers = collections.OrderedDict()\n",
|
||||
" layers[\"index\"] = layer_index\n",
|
||||
" layers[\"values\"] = layer_output\n",
|
||||
" all_layers.append(layers)\n",
|
||||
" tensorflow_out_features = collections.OrderedDict()\n",
|
||||
" tensorflow_out_features[\"layers\"] = all_layers\n",
|
||||
" tensorflow_all_out_features.append(tensorflow_out_features)\n",
|
||||
"\n",
|
||||
" output_json[\"features\"] = tensorflow_all_out_features\n",
|
||||
" tensorflow_all_out.append(output_json)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -178,8 +189,8 @@
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-02T14:09:17.780532Z",
|
||||
"start_time": "2018-11-02T14:09:17.748778Z"
|
||||
"end_time": "2018-11-03T02:09:46.460128Z",
|
||||
"start_time": "2018-11-03T02:09:46.416138Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
@@ -190,15 +201,28 @@
|
||||
"1\n",
|
||||
"2\n",
|
||||
"odict_keys(['linex_index', 'features'])\n",
|
||||
"14\n"
|
||||
"number of tokens 1\n",
|
||||
"number of layers 12\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(128, 768)"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(len(all_out))\n",
|
||||
"print(len(all_out[0]))\n",
|
||||
"print(all_out[0].keys())\n",
|
||||
"print(len(all_out[0]['features']))"
|
||||
"print(len(tensorflow_all_out))\n",
|
||||
"print(len(tensorflow_all_out[0]))\n",
|
||||
"print(tensorflow_all_out[0].keys())\n",
|
||||
"print(\"number of tokens\", len(tensorflow_all_out[0]['features']))\n",
|
||||
"print(\"number of layers\", len(tensorflow_all_out[0]['features'][0]['layers']))\n",
|
||||
"tensorflow_all_out[0]['features'][0]['layers'][0]['values'].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -206,34 +230,13 @@
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-02T14:09:17.818968Z",
|
||||
"start_time": "2018-11-02T14:09:17.782121Z"
|
||||
"end_time": "2018-11-03T02:09:46.498637Z",
|
||||
"start_time": "2018-11-03T02:09:46.463115Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[-0.628111,\n",
|
||||
" 0.193215,\n",
|
||||
" -0.75185,\n",
|
||||
" -0.040464,\n",
|
||||
" -0.875331,\n",
|
||||
" 0.15654,\n",
|
||||
" 1.385444,\n",
|
||||
" 1.066997,\n",
|
||||
" -0.349549,\n",
|
||||
" 0.270686]"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tensorflow_output = all_out[0]['features'][0]['layers'][0]['values']\n",
|
||||
"tensorflow_output[:10]"
|
||||
"tensorflow_outputs = list(tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -248,12 +251,13 @@
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-02T14:09:17.954196Z",
|
||||
"start_time": "2018-11-02T14:09:17.821115Z"
|
||||
"end_time": "2018-11-03T02:09:46.660303Z",
|
||||
"start_time": "2018-11-03T02:09:46.501325Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import extract_features_pytorch\n",
|
||||
"from extract_features_pytorch import *"
|
||||
]
|
||||
},
|
||||
@@ -262,8 +266,8 @@
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-02T14:09:19.196475Z",
|
||||
"start_time": "2018-11-02T14:09:17.956199Z"
|
||||
"end_time": "2018-11-03T02:09:48.292135Z",
|
||||
"start_time": "2018-11-03T02:09:46.661921Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
@@ -574,7 +578,7 @@
|
||||
"init_checkpoint_pt=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/pytorch_model.bin\"\n",
|
||||
"\n",
|
||||
"device = torch.device(\"cpu\")\n",
|
||||
"model = BertModel(bert_config)\n",
|
||||
"model = extract_features_pytorch.BertModel(bert_config)\n",
|
||||
"model.load_state_dict(torch.load(init_checkpoint_pt, map_location='cpu'))\n",
|
||||
"model.to(device)"
|
||||
]
|
||||
@@ -584,8 +588,8 @@
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-02T14:09:19.236256Z",
|
||||
"start_time": "2018-11-02T14:09:19.198407Z"
|
||||
"end_time": "2018-11-03T02:09:48.332982Z",
|
||||
"start_time": "2018-11-03T02:09:48.294056Z"
|
||||
},
|
||||
"code_folding": []
|
||||
},
|
||||
@@ -896,9 +900,10 @@
|
||||
"source": [
|
||||
"all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)\n",
|
||||
"all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)\n",
|
||||
"all_input_type_ids = torch.tensor([f.input_type_ids for f in features], dtype=torch.long)\n",
|
||||
"all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)\n",
|
||||
"\n",
|
||||
"eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)\n",
|
||||
"eval_data = TensorDataset(all_input_ids, all_input_mask, all_input_type_ids, all_example_index)\n",
|
||||
"eval_sampler = SequentialSampler(eval_data)\n",
|
||||
"eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=1)\n",
|
||||
"\n",
|
||||
@@ -907,41 +912,86 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 16,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-02T14:09:19.671994Z",
|
||||
"start_time": "2018-11-02T14:09:19.239454Z"
|
||||
"end_time": "2018-11-03T02:09:54.371188Z",
|
||||
"start_time": "2018-11-03T02:09:53.976875Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"tensor([[ 101, 2040, 2001, 3958, 27227, 1029, 102, 3958, 27227, 2001,\n",
|
||||
" 1037, 13997, 11510, 102, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0]])\n",
|
||||
"tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
|
||||
" 0, 0, 0, 0, 0, 0, 0, 0]])\n",
|
||||
"tensor([0])\n",
|
||||
"layer 0 0\n",
|
||||
"layer 1 1\n",
|
||||
"layer 2 2\n",
|
||||
"layer 3 3\n",
|
||||
"layer 4 4\n",
|
||||
"layer 5 5\n",
|
||||
"layer 6 6\n",
|
||||
"layer 7 7\n",
|
||||
"layer 8 8\n",
|
||||
"layer 9 9\n",
|
||||
"layer 10 10\n",
|
||||
"layer 11 11\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"layer_indexes = list(range(12))\n",
|
||||
"\n",
|
||||
"pytorch_all_out = []\n",
|
||||
"for input_ids, input_mask, example_indices in eval_dataloader:\n",
|
||||
"for input_ids, input_mask, input_type_ids, example_indices in eval_dataloader:\n",
|
||||
" print(input_ids)\n",
|
||||
" print(input_mask)\n",
|
||||
" print(example_indices)\n",
|
||||
" input_ids = input_ids.to(device)\n",
|
||||
" input_mask = input_mask.float().to(device)\n",
|
||||
"\n",
|
||||
" all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)\n",
|
||||
" all_encoder_layers, _ = model(input_ids, token_type_ids=input_type_ids, attention_mask=input_mask)\n",
|
||||
"\n",
|
||||
" for enc_layers, example_index in zip(all_encoder_layers, example_indices):\n",
|
||||
" for b, example_index in enumerate(example_indices):\n",
|
||||
" feature = features[example_index.item()]\n",
|
||||
" unique_id = int(feature.unique_id)\n",
|
||||
" # feature = unique_id_to_feature[unique_id]\n",
|
||||
" output_json = collections.OrderedDict()\n",
|
||||
" output_json[\"linex_index\"] = unique_id\n",
|
||||
" all_out_features = []\n",
|
||||
" for (i, token) in enumerate(feature.tokens):\n",
|
||||
" all_layers = []\n",
|
||||
" for (j, layer_index) in enumerate(layer_indexes):\n",
|
||||
" layer_output = enc_layers[int(layer_index)].detach().cpu().numpy()\n",
|
||||
" layers = collections.OrderedDict()\n",
|
||||
" layers[\"index\"] = layer_index\n",
|
||||
" layers[\"values\"] = [\n",
|
||||
" round(float(x), 6) for x in layer_output[i:(i + 1)].flat\n",
|
||||
" ]\n",
|
||||
" all_layers.append(layers)\n",
|
||||
" # for (i, token) in enumerate(feature.tokens):\n",
|
||||
" all_layers = []\n",
|
||||
" for (j, layer_index) in enumerate(layer_indexes):\n",
|
||||
" print(\"layer\", j, layer_index)\n",
|
||||
" layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()\n",
|
||||
" layer_output = layer_output[b]\n",
|
||||
" layers = collections.OrderedDict()\n",
|
||||
" layers[\"index\"] = layer_index\n",
|
||||
" layer_output = layer_output\n",
|
||||
" layers[\"values\"] = layer_output if not isinstance(layer_output, (int, float)) else [layer_output]\n",
|
||||
" all_layers.append(layers)\n",
|
||||
"\n",
|
||||
" out_features = collections.OrderedDict()\n",
|
||||
" out_features[\"token\"] = token\n",
|
||||
" out_features[\"layers\"] = all_layers\n",
|
||||
" all_out_features.append(out_features)\n",
|
||||
" output_json[\"features\"] = all_out_features\n",
|
||||
@@ -950,11 +1000,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"execution_count": 17,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-02T14:09:19.706616Z",
|
||||
"start_time": "2018-11-02T14:09:19.673670Z"
|
||||
"end_time": "2018-11-03T02:09:57.139854Z",
|
||||
"start_time": "2018-11-03T02:09:57.104636Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
@@ -965,84 +1015,127 @@
|
||||
"1\n",
|
||||
"2\n",
|
||||
"odict_keys(['linex_index', 'features'])\n",
|
||||
"14\n"
|
||||
"number of tokens 1\n",
|
||||
"number of layers 12\n",
|
||||
"hidden_size 128\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(128, 768)"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(len(pytorch_all_out))\n",
|
||||
"print(len(pytorch_all_out[0]))\n",
|
||||
"print(pytorch_all_out[0].keys())\n",
|
||||
"print(len(pytorch_all_out[0]['features']))"
|
||||
"print(\"number of tokens\", len(pytorch_all_out))\n",
|
||||
"print(\"number of layers\", len(pytorch_all_out[0]['features'][0]['layers']))\n",
|
||||
"print(\"hidden_size\", len(pytorch_all_out[0]['features'][0]['layers'][0]['values']))\n",
|
||||
"pytorch_all_out[0]['features'][0]['layers'][0]['values'].shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"execution_count": 18,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-02T14:10:28.295669Z",
|
||||
"start_time": "2018-11-02T14:10:28.263140Z"
|
||||
"end_time": "2018-11-03T02:09:59.000058Z",
|
||||
"start_time": "2018-11-03T02:09:58.967575Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(128, 768)\n",
|
||||
"(128, 768)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pytorch_outputs = list(pytorch_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes)\n",
|
||||
"print(pytorch_outputs[0].shape)\n",
|
||||
"print(pytorch_outputs[1].shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-03T02:09:59.462123Z",
|
||||
"start_time": "2018-11-03T02:09:59.430932Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(128, 768)\n",
|
||||
"(128, 768)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(tensorflow_outputs[0].shape)\n",
|
||||
"print(tensorflow_outputs[1].shape)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-03T02:10:00.014784Z",
|
||||
"start_time": "2018-11-03T02:09:59.983978Z"
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import numpy as np"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-03T02:10:09.582557Z",
|
||||
"start_time": "2018-11-03T02:10:09.549308Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(128, 768) (128, 768)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[-0.016153,\n",
|
||||
" -0.697252,\n",
|
||||
" -0.298296,\n",
|
||||
" -0.167194,\n",
|
||||
" -0.219306,\n",
|
||||
" 0.061712,\n",
|
||||
" -0.006953,\n",
|
||||
" 0.366519,\n",
|
||||
" -0.031027,\n",
|
||||
" -0.33547]"
|
||||
"4.1671223e-07"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"pytorch_output = pytorch_all_out[0]['features'][0]['layers'][0]['values']\n",
|
||||
"pytorch_output[:10]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {
|
||||
"ExecuteTime": {
|
||||
"end_time": "2018-11-02T14:10:34.540457Z",
|
||||
"start_time": "2018-11-02T14:10:34.510109Z"
|
||||
}
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[-0.628111,\n",
|
||||
" 0.193215,\n",
|
||||
" -0.75185,\n",
|
||||
" -0.040464,\n",
|
||||
" -0.875331,\n",
|
||||
" 0.15654,\n",
|
||||
" 1.385444,\n",
|
||||
" 1.066997,\n",
|
||||
" -0.349549,\n",
|
||||
" 0.270686]"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"tensorflow_output[:10]"
|
||||
"i = 11\n",
|
||||
"print(np.array(tensorflow_outputs[i]).shape, np.array(pytorch_outputs[i]).shape)\n",
|
||||
"np.sqrt(np.mean((np.array(tensorflow_outputs[i]) - np.array(pytorch_outputs[i]))**2.0))"
|
||||
]
|
||||
},
|
||||
{
|
||||
|
||||
@@ -268,29 +268,31 @@ def main():
|
||||
input_mask = input_mask.float().to(device)
|
||||
|
||||
all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
|
||||
all_encoder_layers = all_encoder_layers
|
||||
|
||||
for enc_layers, example_index in zip(all_encoder_layers, example_indices):
|
||||
for b, example_index in enumerate(example_indices):
|
||||
feature = features[example_index.item()]
|
||||
unique_id = int(feature.unique_id)
|
||||
# feature = unique_id_to_feature[unique_id]
|
||||
output_json = collections.OrderedDict()
|
||||
output_json["linex_index"] = unique_id
|
||||
all_features = []
|
||||
all_out_features = []
|
||||
for (i, token) in enumerate(feature.tokens):
|
||||
all_layers = []
|
||||
for (j, layer_index) in enumerate(layer_indexes):
|
||||
layer_output = enc_layers[int(layer_index)].detach().cpu().numpy()
|
||||
layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
|
||||
layer_output = layer_output[b]
|
||||
layers = collections.OrderedDict()
|
||||
layers["index"] = layer_index
|
||||
layers["values"] = [
|
||||
round(float(x), 6) for x in layer_output[i:(i + 1)].flat
|
||||
round(x.item(), 6) for x in layer_output[i]
|
||||
]
|
||||
all_layers.append(layers)
|
||||
features = collections.OrderedDict()
|
||||
features["token"] = token
|
||||
features["layers"] = all_layers
|
||||
all_features.append(features)
|
||||
output_json["features"] = all_features
|
||||
out_features = collections.OrderedDict()
|
||||
out_features["token"] = token
|
||||
out_features["layers"] = all_layers
|
||||
all_out_features.append(out_features)
|
||||
output_json["features"] = all_out_features
|
||||
writer.write(json.dumps(output_json) + "\n")
|
||||
|
||||
|
||||
|
||||
@@ -27,8 +27,9 @@ import torch.nn as nn
|
||||
from torch.nn import CrossEntropyLoss
|
||||
|
||||
def gelu(x):
|
||||
return 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
|
||||
# OpenAI GPT gelu version was : 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
|
||||
# OpenAI GPT gelu version :
|
||||
# return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
|
||||
|
||||
|
||||
class BertConfig(object):
|
||||
@@ -157,7 +158,7 @@ class BERTEmbeddings(nn.Module):
|
||||
words_embeddings = self.word_embeddings(input_ids)
|
||||
position_embeddings = self.position_embeddings(position_ids)
|
||||
token_type_embeddings = self.token_type_embeddings(token_type_ids)
|
||||
|
||||
|
||||
embeddings = words_embeddings + position_embeddings + token_type_embeddings
|
||||
embeddings = self.LayerNorm(embeddings)
|
||||
embeddings = self.dropout(embeddings)
|
||||
@@ -196,19 +197,19 @@ class BERTSelfAttention(nn.Module):
|
||||
# T = `to_tensor` sequence length
|
||||
# N = `num_attention_heads`
|
||||
# H = `size_per_head`
|
||||
query_layer = self.query(hidden_states)
|
||||
key_layer = self.key(hidden_states)
|
||||
value_layer = self.value(hidden_states)
|
||||
mixed_query_layer = self.query(hidden_states)
|
||||
mixed_key_layer = self.key(hidden_states)
|
||||
mixed_value_layer = self.value(hidden_states)
|
||||
|
||||
query_layer = self.transpose_for_scores(query_layer)
|
||||
key_layer = self.transpose_for_scores(key_layer, is_key_tensor=True)
|
||||
value_layer = self.transpose_for_scores(value_layer)
|
||||
query_layer = self.transpose_for_scores(mixed_query_layer)
|
||||
key_layer = self.transpose_for_scores(mixed_key_layer) #, is_key_tensor=True)
|
||||
value_layer = self.transpose_for_scores(mixed_value_layer)
|
||||
|
||||
# Take the dot product between "query" and "key" to get the raw
|
||||
# attention scores.
|
||||
# `attention_scores` = [B, N, F, T]
|
||||
attention_scores = torch.matmul(query_layer, key_layer)
|
||||
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
||||
attention_scores_no_norm = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
||||
attention_scores_no_mask = attention_scores_no_norm / math.sqrt(self.attention_head_size)
|
||||
|
||||
# TODO clean up this (precompute)
|
||||
# MY PYTORCH: w = w * self.b + -1e9 * (1 - self.b) # TF implem method: mask_attn_weights
|
||||
@@ -220,21 +221,26 @@ class BERTSelfAttention(nn.Module):
|
||||
# adder = (1.0 - attention_mask) * -10000.0
|
||||
# Since we are adding it to the raw scores before the softmax, this is
|
||||
# effectively the same as removing these entirely.
|
||||
attention_scores += attention_mask
|
||||
attention_scores = attention_scores_no_mask + attention_mask
|
||||
|
||||
# Normalize the attention scores to probabilities.
|
||||
# `attention_probs` = [B, N, F, T]
|
||||
attention_probs = nn.Softmax(dim=-1)(attention_scores)
|
||||
attention_probs_no_drop = nn.Softmax(dim=-1)(attention_scores)
|
||||
|
||||
# This is actually dropping out entire tokens to attend to, which might
|
||||
# seem a bit unusual, but is taken from the original Transformer paper.
|
||||
attention_probs = self.dropout(attention_probs)
|
||||
attention_probs = self.dropout(attention_probs_no_drop)
|
||||
|
||||
context_layer = torch.matmul(attention_probs, value_layer)
|
||||
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
||||
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
|
||||
context_layer = context_layer.view(*new_context_layer_shape)
|
||||
|
||||
# aux_attention = attention_probs[0, 0, 0, :].view(1, 128, 1)
|
||||
# aux_attention = aux_attention.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
|
||||
# aux_attention = key_layer.permute(0, 2, 3, 1).contiguous().view(1, 128, 768)
|
||||
# aux_attention = key_layer.permute(0, 2, 1, 3).contiguous().view(1, 128, 768)
|
||||
|
||||
return context_layer
|
||||
|
||||
|
||||
@@ -246,7 +252,7 @@ class BERTSelfOutput(nn.Module):
|
||||
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
||||
|
||||
def forward(self, hidden_states, input_tensor):
|
||||
hidden_states = self.dense(input_tensor)
|
||||
hidden_states = self.dense(hidden_states)
|
||||
hidden_states = self.dropout(hidden_states)
|
||||
hidden_states = self.LayerNorm(hidden_states + input_tensor)
|
||||
return hidden_states
|
||||
@@ -259,8 +265,8 @@ class BERTAttention(nn.Module):
|
||||
self.output = BERTSelfOutput(config)
|
||||
|
||||
def forward(self, input_tensor, attention_mask):
|
||||
attention_output = self.self(input_tensor, attention_mask)
|
||||
attention_output = self.output(attention_output, input_tensor)
|
||||
self_output = self.self(input_tensor, attention_mask)
|
||||
attention_output = self.output(self_output, input_tensor)
|
||||
return attention_output
|
||||
|
||||
|
||||
@@ -388,13 +394,16 @@ class BertModel(nn.Module):
|
||||
if token_type_ids is None:
|
||||
token_type_ids = torch.zeros_like(input_ids)
|
||||
|
||||
attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
||||
attention_mask = (1.0 - attention_mask) * -10000.0
|
||||
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
|
||||
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
|
||||
|
||||
embedding_output = self.embeddings(input_ids, token_type_ids)
|
||||
all_encoder_layers = self.encoder(embedding_output, attention_mask)
|
||||
all_encoder_layers = self.encoder(embedding_output, extended_attention_mask)
|
||||
sequence_output = all_encoder_layers[-1]
|
||||
pooled_output = self.pooler(sequence_output)
|
||||
|
||||
# TODO DEbugging
|
||||
# all_encoder_layers = [attention_mask, embeddings_sum, embedding_output] + all_encoder_layers
|
||||
return all_encoder_layers, pooled_output
|
||||
|
||||
class BertForSequenceClassification(nn.Module):
|
||||
|
||||
Reference in New Issue
Block a user