Greatly reduced memory usage when pregenerating the data by writing it
out on the fly without shuffling; the Sampler in the fine-tuning script will shuffle for us.
This commit is contained in:
@@ -74,8 +74,6 @@ class PregeneratedDataset(Dataset):
|
|||||||
with data_file.open() as f:
|
with data_file.open() as f:
|
||||||
for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")):
|
for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")):
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line:
|
|
||||||
continue # Skip trailing blank lines etc.
|
|
||||||
example = json.loads(line)
|
example = json.loads(line)
|
||||||
features = convert_example_to_features(example, tokenizer, seq_len)
|
features = convert_example_to_features(example, tokenizer, seq_len)
|
||||||
input_ids[i] = features.input_ids
|
input_ids[i] = features.input_ids
|
||||||
|
|||||||
Reference in New Issue
Block a user