Clean up data collators and datasets (#8308)

* Clean up data collators and datasets

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Remove needless clone

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
Sylvain Gugger
2020-11-04 17:24:49 -05:00
committed by GitHub
parent b1d3e95eb5
commit 9c4aa4ac1a
6 changed files with 136 additions and 197 deletions

View File

@@ -264,7 +264,15 @@ def main():
def tokenize_function(examples):
# Remove empty lines
examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
return tokenizer(examples["text"], padding=padding, truncation=True, max_length=data_args.max_seq_length)
return tokenizer(
examples["text"],
padding=padding,
truncation=True,
max_length=data_args.max_seq_length,
# We use this option because DataCollatorForLanguageModeling (see below) is more efficient when it
# receives the `special_tokens_mask`.
return_special_tokens_mask=True,
)
tokenized_datasets = datasets.map(
tokenize_function,
@@ -275,8 +283,10 @@ def main():
)
else:
# Otherwise, we tokenize every text, then concatenate them together before splitting them into smaller parts.
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
# efficient when it receives the `special_tokens_mask`.
def tokenize_function(examples):
return tokenizer(examples[text_column_name])
return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
tokenized_datasets = datasets.map(
tokenize_function,