From 2cf3447e0a3e3fd04b715f1e6f4ee43575e1e7c9 Mon Sep 17 00:00:00 2001 From: Juha Kiili Date: Thu, 21 Nov 2019 12:35:25 +0200 Subject: [PATCH 1/6] Glue: log in Valohai-compatible JSON format too --- examples/run_glue.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/examples/run_glue.py b/examples/run_glue.py index 527e440075..ea5ac5bbb7 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -22,6 +22,7 @@ import glob import logging import os import random +import json import numpy as np import torch @@ -171,13 +172,21 @@ def train(args, train_dataset, model, tokenizer): if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: # Log metrics + logs = {'step': global_step} if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): - tb_writer.add_scalar('eval_{}'.format(key), value, global_step) - tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step) - tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) + eval_key = 'eval_{}'.format(key) + tb_writer.add_scalar(eval_key, value, global_step) + logs[eval_key] = str(value) logging_loss = tr_loss + loss_scalar = (tr_loss - logging_loss) / args.logging_steps + learning_rate_scalar = scheduler.get_lr()[0] + tb_writer.add_scalar('lr', learning_rate_scalar, global_step) + tb_writer.add_scalar('loss', loss_scalar, global_step) + logs['learning_rate'] = learning_rate_scalar + logs['loss'] = loss_scalar + print(json.dumps(logs)) if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint From aac35514075290a46419b9bd969e6f94fef9d43b Mon Sep 17 00:00:00 2001 From: Aarni Koskela Date: Thu, 21 Nov 2019 12:37:39 +0200 Subject: [PATCH 2/6] Add download_glue_data.py from kamalkraj/ALBERT-TF2.0 
Original source: https://github.com/kamalkraj/ALBERT-TF2.0/blob/fa90194e5fe729dbb19f32ac29c8d6d6372c0f93/download_glue_data.py Original license: https://github.com/kamalkraj/ALBERT-TF2.0/blob/fa90194e5fe729dbb19f32ac29c8d6d6372c0f93/LICENSE (Apache-2.0) --- utils/download_glue_data.py | 141 ++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 utils/download_glue_data.py diff --git a/utils/download_glue_data.py b/utils/download_glue_data.py new file mode 100644 index 0000000000..86a4e8951f --- /dev/null +++ b/utils/download_glue_data.py @@ -0,0 +1,141 @@ +''' Script for downloading all GLUE data. + +Note: for legal reasons, we are unable to host MRPC. +You can either use the version hosted by the SentEval team, which is already tokenized, +or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually. +For Windows users, you can run the .msi file. For Mac and Linux users, consider an external library such as 'cabextract' (see below for an example). +You should then rename and place specific files in a folder (see below for an example). + +mkdir MRPC +cabextract MSRParaphraseCorpus.msi -d MRPC +cat MRPC/_2DEC3DBE877E4DB192D17C0256E90F1D | tr -d $'\r' > MRPC/msr_paraphrase_train.txt +cat MRPC/_D7B391F9EAFF4B1B8BCE8F21B20B1B61 | tr -d $'\r' > MRPC/msr_paraphrase_test.txt +rm MRPC/_* +rm MSRParaphraseCorpus.msi + +1/30/19: It looks like SentEval is no longer hosting their extracted and tokenized MRPC data, so you'll need to download the data from the original source for now. +2/11/19: It looks like SentEval actually *is* hosting the extracted data. Hooray! 
+''' + +import os +import sys +import shutil +import argparse +import tempfile +import urllib.request +import zipfile + +TASKS = ["CoLA", "SST", "MRPC", "QQP", "STS", "MNLI", "SNLI", "QNLI", "RTE", "WNLI", "diagnostic"] +TASK2PATH = {"CoLA":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4', + "SST":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8', + "MRPC":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc', + "QQP":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5', + "STS":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5', + "MNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce', + "SNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df', + "QNLI": 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601', + "RTE":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb', + "WNLI":'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf', + 
"diagnostic":'https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D'} + +MRPC_TRAIN = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt' +MRPC_TEST = 'https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt' + +def download_and_extract(task, data_dir): + print("Downloading and extracting %s..." % task) + data_file = "%s.zip" % task + urllib.request.urlretrieve(TASK2PATH[task], data_file) + with zipfile.ZipFile(data_file) as zip_ref: + zip_ref.extractall(data_dir) + os.remove(data_file) + print("\tCompleted!") + +def format_mrpc(data_dir, path_to_data): + print("Processing MRPC...") + mrpc_dir = os.path.join(data_dir, "MRPC") + if not os.path.isdir(mrpc_dir): + os.mkdir(mrpc_dir) + if path_to_data: + mrpc_train_file = os.path.join(path_to_data, "msr_paraphrase_train.txt") + mrpc_test_file = os.path.join(path_to_data, "msr_paraphrase_test.txt") + else: + print("Local MRPC data not specified, downloading data from %s" % MRPC_TRAIN) + mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt") + mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt") + urllib.request.urlretrieve(MRPC_TRAIN, mrpc_train_file) + urllib.request.urlretrieve(MRPC_TEST, mrpc_test_file) + assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file + assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file + 
urllib.request.urlretrieve(TASK2PATH["MRPC"], os.path.join(mrpc_dir, "dev_ids.tsv")) + + dev_ids = [] + with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh: + for row in ids_fh: + dev_ids.append(row.strip().split('\t')) + + with open(mrpc_train_file, encoding="utf8") as data_fh, \ + open(os.path.join(mrpc_dir, "train.tsv"), 'w', encoding="utf8") as train_fh, \ + open(os.path.join(mrpc_dir, "dev.tsv"), 'w', encoding="utf8") as dev_fh: + header = data_fh.readline() + train_fh.write(header) + dev_fh.write(header) + for row in data_fh: + label, id1, id2, s1, s2 = row.strip().split('\t') + if [id1, id2] in dev_ids: + dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) + else: + train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2)) + + with open(mrpc_test_file, encoding="utf8") as data_fh, \ + open(os.path.join(mrpc_dir, "test.tsv"), 'w', encoding="utf8") as test_fh: + header = data_fh.readline() + test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n") + for idx, row in enumerate(data_fh): + label, id1, id2, s1, s2 = row.strip().split('\t') + test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2)) + print("\tCompleted!") + +def download_diagnostic(data_dir): + print("Downloading and extracting diagnostic...") + if not os.path.isdir(os.path.join(data_dir, "diagnostic")): + os.mkdir(os.path.join(data_dir, "diagnostic")) + data_file = os.path.join(data_dir, "diagnostic", "diagnostic.tsv") + urllib.request.urlretrieve(TASK2PATH["diagnostic"], data_file) + print("\tCompleted!") + return + +def get_tasks(task_names): + task_names = task_names.split(',') + if "all" in task_names: + tasks = TASKS + else: + tasks = [] + for task_name in task_names: + assert task_name in TASKS, "Task %s not found!" 
% task_name + tasks.append(task_name) + return tasks + +def main(arguments): + parser = argparse.ArgumentParser() + parser.add_argument('--data_dir', help='directory to save data to', type=str, default='glue_data') + parser.add_argument('--tasks', help='tasks to download data for as a comma separated string', + type=str, default='all') + parser.add_argument('--path_to_mrpc', help='path to directory containing extracted MRPC data, msr_paraphrase_train.txt and msr_paraphrase_text.txt', + type=str, default='') + args = parser.parse_args(arguments) + + if not os.path.isdir(args.data_dir): + os.mkdir(args.data_dir) + tasks = get_tasks(args.tasks) + + for task in tasks: + if task == 'MRPC': + format_mrpc(args.data_dir, args.path_to_mrpc) + elif task == 'diagnostic': + download_diagnostic(args.data_dir) + else: + download_and_extract(task, args.data_dir) + + +if __name__ == '__main__': + sys.exit(main(sys.argv[1:])) From 05d4232f63f121baefec9a87704ea7a15933f6e9 Mon Sep 17 00:00:00 2001 From: Juha Kiili Date: Thu, 21 Nov 2019 12:38:17 +0200 Subject: [PATCH 3/6] Add valohai.yaml --- valohai.yaml | 94 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 valohai.yaml diff --git a/valohai.yaml b/valohai.yaml new file mode 100644 index 0000000000..2573551b4e --- /dev/null +++ b/valohai.yaml @@ -0,0 +1,94 @@ +--- + +- step: + name: Execute python examples/run_glue.py + image: pytorch/pytorch:nightly-devel-cuda10.0-cudnn7 + command: + - python /valohai/repository/utils/download_glue_data.py --data_dir=/glue_data + - pip install -e . 
+ - pip install -r examples/requirements.txt + - python examples/run_glue.py --do_train --data_dir=/glue_data/{parameter-value:task_name} {parameters} + parameters: + - name: model_type + pass-as: --model_type={v} + type: string + default: bert + - name: model_name_or_path + pass-as: --model_name_or_path={v} + type: string + default: bert-base-uncased + - name: task_name + pass-as: --task_name={v} + type: string + default: MRPC + - name: max_seq_length + pass-as: --max_seq_length={v} + description: The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded. + type: integer + default: 128 + - name: per_gpu_train_batch_size + pass-as: --per_gpu_train_batch_size={v} + description: Batch size per GPU/CPU for training. + type: integer + default: 8 + - name: per_gpu_eval_batch_size + pass-as: --per_gpu_eval_batch_size={v} + description: Batch size per GPU/CPU for evaluation. + type: integer + default: 8 + - name: gradient_accumulation_steps + pass-as: --gradient_accumulation_steps={v} + description: Number of updates steps to accumulate before performing a backward/update pass. + type: integer + default: 1 + - name: learning_rate + pass-as: --learning_rate={v} + description: The initial learning rate for Adam. + type: float + default: 0.00005 + - name: adam_epsilon + pass-as: --adam_epsilon={v} + description: Epsilon for Adam optimizer. + type: float + default: 0.00000001 + - name: max_grad_norm + pass-as: --max_grad_norm={v} + description: Max gradient norm. + type: float + default: 1.0 + - name: num_train_epochs + pass-as: --num_train_epochs={v} + description: Total number of training epochs to perform. + type: integer + default: 3 + - name: max_steps + pass-as: --max_steps={v} + description: If > 0, set total number of training steps to perform. Override num_train_epochs. 
+ type: integer + default: -1 + - name: warmup_steps + pass-as: --warmup_steps={v} + description: Linear warmup over warmup_steps. + type: integer + default: -1 + - name: logging_steps + pass-as: --logging_steps={v} + description: Log every X updates steps. + type: integer + default: 25 + - name: save_steps + pass-as: --save_steps={v} + description: Save checkpoint every X updates steps. + type: integer + default: -1 + - name: output_dir + pass-as: --output_dir={v} + type: string + default: /valohai/outputs + - name: evaluate_during_training + description: Run evaluation during training at each logging step. + type: flag + default: true + - name: do_lower_case + description: Set this flag if you are using an uncased model. + type: flag From 41aa0e80039d3148c55f4fe967247d4f7bbbfec5 Mon Sep 17 00:00:00 2001 From: Juha Kiili Date: Fri, 29 Nov 2019 15:33:25 +0200 Subject: [PATCH 4/6] Refactor logs and fix loss bug --- examples/run_glue.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/run_glue.py b/examples/run_glue.py index ea5ac5bbb7..8749593a1f 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -171,22 +171,22 @@ def train(args, train_dataset, model, tokenizer): global_step += 1 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: - # Log metrics - logs = {'step': global_step} + logs = {} if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): eval_key = 'eval_{}'.format(key) - tb_writer.add_scalar(eval_key, value, global_step) - logs[eval_key] = str(value) - logging_loss = tr_loss + logs[eval_key] = value + loss_scalar = (tr_loss - logging_loss) / args.logging_steps learning_rate_scalar = scheduler.get_lr()[0] - tb_writer.add_scalar('lr', learning_rate_scalar, global_step) - tb_writer.add_scalar('loss', 
loss_scalar, global_step) logs['learning_rate'] = learning_rate_scalar logs['loss'] = loss_scalar - print(json.dumps(logs)) + logging_loss = tr_loss + + for key, value in logs.items(): + tb_writer.add_scalar(key, value, global_step) + print(json.dumps({**logs, **{'step': global_step}})) if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: # Save model checkpoint From 2421e54f8c354fc110a7f8819a9161163813f7ad Mon Sep 17 00:00:00 2001 From: Juha Kiili Date: Fri, 29 Nov 2019 15:39:28 +0200 Subject: [PATCH 5/6] Add link to original source and license to download_glue_data.py --- utils/download_glue_data.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/download_glue_data.py b/utils/download_glue_data.py index 86a4e8951f..f676a71c76 100644 --- a/utils/download_glue_data.py +++ b/utils/download_glue_data.py @@ -1,5 +1,8 @@ ''' Script for downloading all GLUE data. +Original source: https://github.com/kamalkraj/ALBERT-TF2.0/blob/fa90194e5fe729dbb19f32ac29c8d6d6372c0f93/download_glue_data.py +Original license: https://github.com/kamalkraj/ALBERT-TF2.0/blob/fa90194e5fe729dbb19f32ac29c8d6d6372c0f93/LICENSE (Apache-2.0) + Note: for legal reasons, we are unable to host MRPC. You can either use the version hosted by the SentEval team, which is already tokenized, or you can download the original data from (https://download.microsoft.com/download/D/4/6/D46FF87A-F6B9-4252-AA8B-3604ED519838/MSRParaphraseCorpus.msi) and extract the data from it manually.
From 66fc8d25a5de43c08baeea8b22b9bcf57346c325 Mon Sep 17 00:00:00 2001 From: Juha Kiili Date: Tue, 3 Dec 2019 10:49:50 +0200 Subject: [PATCH 6/6] Change ref to original GLUE downloader script --- utils/download_glue_data.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/utils/download_glue_data.py b/utils/download_glue_data.py index f676a71c76..de8cfa9e73 100644 --- a/utils/download_glue_data.py +++ b/utils/download_glue_data.py @@ -1,7 +1,5 @@ ''' Script for downloading all GLUE data. - -Original source: https://github.com/kamalkraj/ALBERT-TF2.0/blob/fa90194e5fe729dbb19f32ac29c8d6d6372c0f93/download_glue_data.py -Original license: https://github.com/kamalkraj/ALBERT-TF2.0/blob/fa90194e5fe729dbb19f32ac29c8d6d6372c0f93/LICENSE (Apache-2.0) +Original source: https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e Note: for legal reasons, we are unable to host MRPC. You can either use the version hosted by the SentEval team, which is already tokenized,