From c9690e57f84d5911bb23562e532d5d925ae3bc42 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 2 Nov 2018 14:25:21 +0100 Subject: [PATCH] adding jupyter, updating extract features adding simple test file --- Comparing TF and PT models.ipynb | 288 +++++++++++++++++++++++++++++++ extract_features_pytorch.py | 58 +++---- input.txt | 1 + 3 files changed, 318 insertions(+), 29 deletions(-) create mode 100644 Comparing TF and PT models.ipynb create mode 100644 input.txt diff --git a/Comparing TF and PT models.ipynb b/Comparing TF and PT models.ipynb new file mode 100644 index 0000000000..1148bac16e --- /dev/null +++ b/Comparing TF and PT models.ipynb @@ -0,0 +1,288 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# TensorFlow code" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2018-11-02T13:05:56.692585Z", + "start_time": "2018-11-02T13:05:55.699169Z" + } + }, + "outputs": [], + "source": [ + "from extract_features import *" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2018-11-02T13:18:23.944585Z", + "start_time": "2018-11-02T13:18:23.821309Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:*** Example ***\n", + "INFO:tensorflow:unique_id: 0\n", + "INFO:tensorflow:tokens: [CLS] who was jim henson ? [SEP] jim henson was a puppet ##eer [SEP]\n", + "INFO:tensorflow:input_ids: 101 2040 2001 3958 27227 1029 102 3958 27227 2001 1037 13997 11510 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n", + "INFO:tensorflow:input_type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n" + ] + } + ], + "source": [ + "data_dir=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/data/glue_data/MRPC/\"\n", + "vocab_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/vocab.txt\"\n", + "bert_config_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/bert_config.json\"\n", + "init_checkpoint=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/google_models/uncased_L-12_H-768_A-12/bert_model.ckpt\"\n", + "max_seq_length=128\n", + "input_file=\"/Users/thomaswolf/Documents/Thomas/Code/HF/BERT/pytorch-pretrained-BERT/input.txt\"\n", + "\n", + "layer_indexes = [-1]\n", + "bert_config = modeling.BertConfig.from_json_file(bert_config_file)\n", + "tokenizer = tokenization.FullTokenizer(\n", + " vocab_file=vocab_file, do_lower_case=True)\n", + "examples = read_examples(input_file)\n", + "\n", + "features = convert_examples_to_features(\n", + " examples=examples, seq_length=max_seq_length, tokenizer=tokenizer)\n", + "unique_id_to_feature = {}\n", + "for feature in features:\n", + " unique_id_to_feature[feature.unique_id] = feature" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2018-11-02T13:18:24.802620Z", + "start_time": "2018-11-02T13:18:24.764474Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:Estimator's model_fn (.model_fn at 0x128feb7b8>) includes params argument, but params are not passed to Estimator.\n", + "WARNING:tensorflow:Using temporary folder as model directory: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpp9hntmfs\n", + "INFO:tensorflow:Using config: {'_model_dir': '/var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpp9hntmfs', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true\n", + "graph_options {\n", + " rewrite_options {\n", + " meta_optimizer_iterations: ONE\n", + " }\n", + "}\n", + ", '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': , '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=2, num_shards=1, num_cores_per_replica=None, per_host_input_for_training=3, tpu_job_name=None, initial_infeed_sleep_secs=None, input_partition_dims=None), '_cluster': None}\n", + "WARNING:tensorflow:Setting TPUConfig.num_shards==1 is an unsupported behavior. Please fix as soon as possible (leaving num_shards as None.\n", + "INFO:tensorflow:_TPUContext: eval_on_tpu True\n", + "WARNING:tensorflow:eval_on_tpu ignored because use_tpu is False.\n" + ] + } + ], + "source": [ + "is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2\n", + "run_config = tf.contrib.tpu.RunConfig(\n", + " master=None,\n", + " tpu_config=tf.contrib.tpu.TPUConfig(\n", + " num_shards=1,\n", + " per_host_input_for_training=is_per_host))\n", + "\n", + "model_fn = model_fn_builder(\n", + " bert_config=bert_config,\n", + " init_checkpoint=init_checkpoint,\n", + " layer_indexes=layer_indexes,\n", + " use_tpu=False,\n", + " use_one_hot_embeddings=False)\n", + "\n", + "# If TPU is not available, this will fall back to normal Estimator on CPU\n", + "# or GPU.\n", + "estimator = tf.contrib.tpu.TPUEstimator(\n", + " use_tpu=False,\n", + " model_fn=model_fn,\n", + " config=run_config,\n", + " predict_batch_size=1)\n", + "\n", + "input_fn = input_fn_builder(\n", + " features=features, seq_length=max_seq_length)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2018-11-02T13:19:20.060587Z", + "start_time": "2018-11-02T13:19:14.804525Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:tensorflow:Could not find trained model in model_dir: /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpp9hntmfs, running initialization to predict.\n", + "INFO:tensorflow:Calling model_fn.\n", + "INFO:tensorflow:Running infer on CPU\n", + "INFO:tensorflow:Done calling model_fn.\n", + "INFO:tensorflow:Graph was finalized.\n", + "INFO:tensorflow:Running local_init_op.\n", + "INFO:tensorflow:Done running local_init_op.\n", + "INFO:tensorflow:prediction_loop marked as finished\n", + "INFO:tensorflow:prediction_loop marked as finished\n" + ] + } + ], + "source": [ + "all_out = []\n", + "for result in estimator.predict(input_fn, yield_single_examples=True):\n", + " unique_id = int(result[\"unique_id\"])\n", + " feature = unique_id_to_feature[unique_id]\n", + " output_json = collections.OrderedDict()\n", + " output_json[\"linex_index\"] = unique_id\n", + " all_features = []\n", + " for (i, token) in enumerate(feature.tokens):\n", + " all_layers = []\n", + " for (j, layer_index) in enumerate(layer_indexes):\n", + " layer_output = result[\"layer_output_%d\" % j]\n", + " layers = collections.OrderedDict()\n", + " layers[\"index\"] = layer_index\n", + " layers[\"values\"] = [\n", + " round(float(x), 6) for x in layer_output[i:(i + 1)].flat\n", + " ]\n", + " all_layers.append(layers)\n", + " features = collections.OrderedDict()\n", + " features[\"token\"] = token\n", + " features[\"layers\"] = all_layers\n", + " all_features.append(features)\n", + " output_json[\"features\"] = all_features\n", + " all_out.append(output_json)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "ExecuteTime": { + "end_time": "2018-11-02T13:22:39.694206Z", + "start_time": "2018-11-02T13:22:39.663432Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "14" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(all_out)\n", + "len(all_out[0])\n", + "all_out[0].keys()\n", + "len(all_out[0]['features'])" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "ExecuteTime": { + "end_time": "2018-11-02T13:23:05.752981Z", + "start_time": "2018-11-02T13:23:05.723891Z" + } + }, + "outputs": [], + "source": [ + "tensorflow_output = all_out[0]['features'][0]['layers'][0]['values']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PyTorch code" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "ExecuteTime": { + "end_time": "2018-11-02T13:24:27.644785Z", + "start_time": "2018-11-02T13:24:27.611996Z" + } + }, + "outputs": [], + "source": [ + "from extract_features_pytorch import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "hide_input": false, + "kernelspec": { + "display_name": "Python [conda env:bert]", + "language": "python", + "name": "conda-env-bert-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.7" + }, + "toc": { + "colors": { + "hover_highlight": "#DAA520", + "running_highlight": "#FF0000", + "selected_highlight": "#FFD700" + }, + "moveMenuLeft": true, + "nav_menu": { + "height": "48px", + "width": "252px" + }, + "navigate_menu": true, + "number_sections": true, + "sideBar": true, + "threshold": 4, + "toc_cell": false, + "toc_section_display": "block", + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/extract_features_pytorch.py b/extract_features_pytorch.py index 8d1054f967..6f4f79e0f9 100644 --- a/extract_features_pytorch.py +++ b/extract_features_pytorch.py @@ -37,35 +37,6 @@ logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(messa level = logging.INFO) logger = logging.getLogger(__name__) -parser = argparse.ArgumentParser() - -## Required parameters -parser.add_argument("--input_file", default=None, type=str, required=True) -parser.add_argument("--vocab_file", default=None, type=str, required=True, - help="The vocabulary file that the BERT model was trained on.") -parser.add_argument("--output_file", default=None, type=str, required=True) -parser.add_argument("--bert_config_file", default=None, type=str, required=True, - help="The config json file corresponding to the pre-trained BERT model. " - "This specifies the model architecture.") -parser.add_argument("--init_checkpoint", default=None, type=str, required=True, - help="Initial checkpoint (usually from a pre-trained BERT model).") - -## Other parameters -parser.add_argument("--layers", default="-1,-2,-3,-4", type=str) -parser.add_argument("--max_seq_length", default=128, type=int, - help="The maximum total input sequence length after WordPiece tokenization. Sequences longer " - "than this will be truncated, and sequences shorter than this will be padded.") -parser.add_argument("--do_lower_case", default=True, action='store_true', - help="Whether to lower case the input text. Should be True for uncased " - "models and False for cased models.") -parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.") -parser.add_argument("--local_rank", - type=int, - default=-1, - help = "local_rank for distributed training on gpus") - -args = parser.parse_args() - class InputExample(object): @@ -219,6 +190,35 @@ def read_examples(input_file): def main(): + parser = argparse.ArgumentParser() + + ## Required parameters + parser.add_argument("--input_file", default=None, type=str, required=True) + parser.add_argument("--vocab_file", default=None, type=str, required=True, + help="The vocabulary file that the BERT model was trained on.") + parser.add_argument("--output_file", default=None, type=str, required=True) + parser.add_argument("--bert_config_file", default=None, type=str, required=True, + help="The config json file corresponding to the pre-trained BERT model. " + "This specifies the model architecture.") + parser.add_argument("--init_checkpoint", default=None, type=str, required=True, + help="Initial checkpoint (usually from a pre-trained BERT model).") + + ## Other parameters + parser.add_argument("--layers", default="-1,-2,-3,-4", type=str) + parser.add_argument("--max_seq_length", default=128, type=int, + help="The maximum total input sequence length after WordPiece tokenization. Sequences longer " + "than this will be truncated, and sequences shorter than this will be padded.") + parser.add_argument("--do_lower_case", default=True, action='store_true', + help="Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.") + parser.add_argument("--local_rank", + type=int, + default=-1, + help = "local_rank for distributed training on gpus") + + args = parser.parse_args() + if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() diff --git a/input.txt b/input.txt new file mode 100644 index 0000000000..d1e3f410d0 --- /dev/null +++ b/input.txt @@ -0,0 +1 @@ +Who was Jim Henson ? ||| Jim Henson was a puppeteer