From 8a861048dd5da11f2a82632333601b7bd42a71b8 Mon Sep 17 00:00:00 2001 From: Matthew Carrigan Date: Thu, 21 Mar 2019 14:08:39 +0000 Subject: [PATCH] Fixed up the notes on a possible future low-memory path --- examples/lm_finetuning/finetune_on_pregenerated.py | 2 +- examples/lm_finetuning/pregenerate_training_data.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index a6670d5a81..95f78143fa 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -54,7 +54,7 @@ def convert_example_to_features(example, tokenizer, max_seq_length): class PregeneratedDataset(Dataset): def __init__(self, training_path, epoch, tokenizer, num_data_epochs): - # TODO Add an option to memmap and shuffle the training data if needed (see note in pregenerate_training_data) + # TODO Add an option to memmap the training data if needed (see note in pregenerate_training_data) self.vocab = tokenizer.vocab self.tokenizer = tokenizer self.epoch = epoch diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py index aa4969aedc..03fb2a763f 100644 --- a/examples/lm_finetuning/pregenerate_training_data.py +++ b/examples/lm_finetuning/pregenerate_training_data.py @@ -220,8 +220,8 @@ def main(): # In this path documents would be stored in a shelf after being tokenized, and multiple processes would convert # those docs into training examples that would be written out on the fly. This would avoid the need to keep # the whole training set in memory and would speed up dataset creation at the cost of code complexity. - # In addition, the finetuning script would need to be modified to store the training epochs as memmaped arrays, - # and to shuffle them by importing to the rows of the array in a random order. + # In addition, the finetuning script would need to be modified + # to store the training epochs as memmapped arrays. tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) vocab_list = list(tokenizer.vocab.keys())