Fixed up the notes on a possible future low-memory path

2019-03-21 14:08:39 +00:00
parent a8a577ba93
commit 8a861048dd
2 changed files with 3 additions and 3 deletions
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -54,7 +54,7 @@ def convert_example_to_features(example, tokenizer, max_seq_length):

 class PregeneratedDataset(Dataset):
    def __init__(self, training_path, epoch, tokenizer, num_data_epochs):
-        # TODO Add an option to memmap and shuffle the training data if needed (see note in pregenerate_training_data)
+        # TODO Add an option to memmap the training data if needed (see note in pregenerate_training_data)
        self.vocab = tokenizer.vocab
        self.tokenizer = tokenizer
        self.epoch = epoch
--- a/examples/lm_finetuning/pregenerate_training_data.py
+++ b/examples/lm_finetuning/pregenerate_training_data.py
@@ -220,8 +220,8 @@ def main():
    #      In this path documents would be stored in a shelf after being tokenized, and multiple processes would convert
    #      those docs into training examples that would be written out on the fly. This would avoid the need to keep
    #      the whole training set in memory and would speed up dataset creation at the cost of code complexity.
-    #      In addition, the finetuning script would need to be modified to store the training epochs as memmaped arrays,
-    #      and to shuffle them by importing to the rows of the array in a random order.
+    #      In addition, the finetuning script would need to be modified
+    #      to store the training epochs as memmapped arrays.

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
    vocab_list = list(tokenizer.vocab.keys())