From 1798e98e5af317961441f926743b15373bf0eafe Mon Sep 17 00:00:00 2001 From: Matthew Carrigan Date: Wed, 20 Mar 2019 16:42:37 +0000 Subject: [PATCH] Added final TODOs --- examples/lm_finetuning/finetune_on_pregenerated.py | 6 +----- examples/lm_finetuning/pregenerate_training_data.py | 8 +++++++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index 6ba7072208..26003045fc 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -54,7 +54,7 @@ def convert_example_to_features(example, tokenizer, max_seq_length): class PregeneratedDataset(Dataset): def __init__(self, training_path, epoch, tokenizer, num_data_epochs): - # TODO Add an option to memmap the training data + # TODO Add an option to memmap and shuffle the training data if needed (see note in pregenerate_training_data) self.vocab = tokenizer.vocab self.tokenizer = tokenizer self.epoch = epoch @@ -101,10 +101,6 @@ class PregeneratedDataset(Dataset): torch.tensor(self.is_nexts[item].astype(np.int64))) -# TODO 2: Test it's all working -# TODO 3: Add a README (can you do that with subfolders?) - - def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py index 20695caba0..e37d9ba822 100644 --- a/examples/lm_finetuning/pregenerate_training_data.py +++ b/examples/lm_finetuning/pregenerate_training_data.py @@ -220,6 +220,13 @@ def main(): args = parser.parse_args() + # TODO Add a low-memory / multiprocessing path for very large datasets + # In this path documents would be stored in a shelf after being tokenized, and multiple processes would convert + # those docs into training examples that would be written out on the fly. 
This would avoid the need to keep + # the whole training set in memory and would speed up dataset creation at the cost of code complexity. + # In addition, the finetuning script would need to be modified to store the training epochs as memmapped arrays, + # and to shuffle them by writing into the rows of the array in a random order. + tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) vocab_list = list(tokenizer.vocab.keys()) with args.corpus_path.open() as f: @@ -232,7 +239,6 @@ def main(): doc = [] else: tokens = tokenizer.tokenize(line) - # TODO If the sentence is longer than max_len, do we split it in the middle? That's probably a bad idea doc.append(tokens) args.save_dir.mkdir(exist_ok=True)