Fixed up the notes on a possible future low-memory path
This commit is contained in:
@@ -54,7 +54,7 @@ def convert_example_to_features(example, tokenizer, max_seq_length):
|
|||||||
|
|
||||||
class PregeneratedDataset(Dataset):
|
class PregeneratedDataset(Dataset):
|
||||||
def __init__(self, training_path, epoch, tokenizer, num_data_epochs):
|
def __init__(self, training_path, epoch, tokenizer, num_data_epochs):
|
||||||
# TODO Add an option to memmap and shuffle the training data if needed (see note in pregenerate_training_data)
|
# TODO Add an option to memmap the training data if needed (see note in pregenerate_training_data)
|
||||||
self.vocab = tokenizer.vocab
|
self.vocab = tokenizer.vocab
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
self.epoch = epoch
|
self.epoch = epoch
|
||||||
|
|||||||
@@ -220,8 +220,8 @@ def main():
|
|||||||
# In this path documents would be stored in a shelf after being tokenized, and multiple processes would convert
|
# In this path documents would be stored in a shelf after being tokenized, and multiple processes would convert
|
||||||
# those docs into training examples that would be written out on the fly. This would avoid the need to keep
|
# those docs into training examples that would be written out on the fly. This would avoid the need to keep
|
||||||
# the whole training set in memory and would speed up dataset creation at the cost of code complexity.
|
# the whole training set in memory and would speed up dataset creation at the cost of code complexity.
|
||||||
# In addition, the finetuning script would need to be modified to store the training epochs as memmaped arrays,
|
# In addition, the finetuning script would need to be modified
|
||||||
# and to shuffle them by importing to the rows of the array in a random order.
|
# to store the training epochs as memmapped arrays.
|
||||||
|
|
||||||
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
|
||||||
vocab_list = list(tokenizer.vocab.keys())
|
vocab_list = list(tokenizer.vocab.keys())
|
||||||
|
|||||||
Reference in New Issue
Block a user