From 8e093e5981e573a0b591dc57e8d52cc3efe82230 Mon Sep 17 00:00:00 2001 From: peterandluc <597624085@qq.com> Date: Thu, 23 Apr 2020 17:10:57 +0200 Subject: [PATCH] Remove 50k limits bug --- src/transformers/data/datasets/language_modeling.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/data/datasets/language_modeling.py b/src/transformers/data/datasets/language_modeling.py index 5695be482b..32b03fae83 100644 --- a/src/transformers/data/datasets/language_modeling.py +++ b/src/transformers/data/datasets/language_modeling.py @@ -90,7 +90,6 @@ class LineByLineTextDataset(Dataset): with open(file_path, encoding="utf-8") as f: lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())] - lines = lines[:50_000] batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size) self.examples = batch_encoding["input_ids"]