From 0ae59e662d628bec34ecb86d8b8ab7323f725ed7 Mon Sep 17 00:00:00 2001 From: Matthew Carrigan Date: Thu, 21 Mar 2019 14:04:17 +0000 Subject: [PATCH] Reduced memory usage for pregenerating the data a lot by writing it out on the fly without shuffling - the Sampler in the finetuning script will shuffle for us. --- .../lm_finetuning/finetune_on_pregenerated.py | 5 +++- .../pregenerate_training_data.py | 28 +++++++++---------- 2 files changed, 17 insertions(+), 16 deletions(-) diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py index a0d393568a..dcaef85545 100644 --- a/examples/lm_finetuning/finetune_on_pregenerated.py +++ b/examples/lm_finetuning/finetune_on_pregenerated.py @@ -73,7 +73,10 @@ class PregeneratedDataset(Dataset): logging.info(f"Loading training examples for epoch {epoch}") with data_file.open() as f: for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")): - example = json.loads(line.rstrip()) + line = line.strip() + if not line: + continue # Skip trailing blank lines etc. + example = json.loads(line) features = convert_example_to_features(example, tokenizer, seq_len) input_ids[i] = features.input_ids segment_ids[i] = features.segment_ids diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py index d77f2a236a..aa4969aedc 100644 --- a/examples/lm_finetuning/pregenerate_training_data.py +++ b/examples/lm_finetuning/pregenerate_training_data.py @@ -242,24 +242,22 @@ def main(): # When choosing a random sentence, we should sample docs proportionally to the number of sentences they contain # Google BERT doesn't do this, and as a result oversamples shorter docs for epoch in trange(args.epochs_to_generate, desc="Epoch"): - epoch_instances = [] - for doc_idx in trange(len(docs), desc="Document"): - doc_instances = create_instances_from_document( - docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob, - masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq, - vocab_list=vocab_list) - doc_instances = [json.dumps(instance) for instance in doc_instances] - epoch_instances.extend(doc_instances) - - shuffle(epoch_instances) - epoch_file = args.output_dir / f"epoch_{epoch}.json" + epoch_filename = args.output_dir / f"epoch_{epoch}.json" + num_instances = 0 + with epoch_filename.open('w') as epoch_file: + for doc_idx in trange(len(docs), desc="Document"): + doc_instances = create_instances_from_document( + docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob, + masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq, + vocab_list=vocab_list) + doc_instances = [json.dumps(instance) for instance in doc_instances] + for instance in doc_instances: + epoch_file.write(instance + '\n') + num_instances += 1 metrics_file = args.output_dir / f"epoch_{epoch}_metrics.json" - with epoch_file.open('w') as out_file: - for instance in epoch_instances: - out_file.write(instance + '\n') with metrics_file.open('w') as metrics_file: metrics = { - "num_training_examples": len(epoch_instances), + "num_training_examples": num_instances, "max_seq_len": args.max_seq_len } metrics_file.write(json.dumps(metrics))