From a8a577ba93e8de476827ae1ecfbc0c32fd40b478 Mon Sep 17 00:00:00 2001 From: Matthew Carrigan Date: Thu, 21 Mar 2019 14:05:52 +0000 Subject: [PATCH] Reduced memory usage for pregenerating the data a lot by writing it out on the fly without shuffling - the Sampler in the finetuning script will shuffle for us. --- examples/lm_finetuning/finetune_on_pregenerated.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index dcaef85545..a6670d5a81 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -74,8 +74,6 @@ class PregeneratedDataset(Dataset): with data_file.open() as f: for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")): line = line.strip() - if not line: - continue # Skip trailing blank lines etc. example = json.loads(line) features = convert_example_to_features(example, tokenizer, seq_len) input_ids[i] = features.input_ids