Added proper context management to ensure cleanup happens in the right
order.
This commit is contained in:
@@ -23,6 +23,7 @@ class DocumentDatabase:
|
||||
self.documents = []
|
||||
self.document_shelf = None
|
||||
self.document_shelf_filepath = None
|
||||
self.temp_dir = None
|
||||
self.doc_lengths = []
|
||||
self.doc_cumsum = None
|
||||
self.cumsum_max = None
|
||||
@@ -68,9 +69,14 @@ class DocumentDatabase:
|
||||
else:
|
||||
return self.documents[item]
|
||||
|
||||
def cleanup(self):
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, traceback):
|
||||
if self.document_shelf is not None:
|
||||
self.document_shelf.close()
|
||||
if self.temp_dir is not None:
|
||||
self.temp_dir.cleanup()
|
||||
|
||||
|
||||
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
|
||||
@@ -247,7 +253,7 @@ def main():
|
||||
|
||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||
vocab_list = list(tokenizer.vocab.keys())
|
||||
docs = DocumentDatabase(reduce_memory=args.reduce_memory)
|
||||
with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
|
||||
with args.train_corpus.open() as f:
|
||||
doc = []
|
||||
for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
|
||||
@@ -280,7 +286,6 @@ def main():
|
||||
"max_seq_len": args.max_seq_len
|
||||
}
|
||||
metrics_file.write(json.dumps(metrics))
|
||||
docs.cleanup()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
Reference in New Issue
Block a user