Switch from using sum for flattening lists of lists in group_texts (#14472)

* remove sum for list flattening

* change to chain(*)

* make chain object a list

* delete empty lines

per sgugger's suggestions

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Nicholas Broad <nicholas@nmbroad.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
Nicholas Broad
2021-11-22 16:17:26 -05:00
committed by GitHub
parent 0b7d053c13
commit 69e16abf98
15 changed files with 35 additions and 20 deletions

View File

@@ -26,6 +26,7 @@ import math
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional
import datasets
@@ -408,7 +409,7 @@ def main():
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@@ -27,6 +27,7 @@ import logging
import math
import os
import random
from itertools import chain
from pathlib import Path
import datasets
@@ -366,7 +367,7 @@ def main():
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@@ -26,6 +26,7 @@ import math
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional
import datasets
@@ -432,7 +433,7 @@ def main():
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@@ -27,6 +27,7 @@ import logging
import math
import os
import random
from itertools import chain
from pathlib import Path
import datasets
@@ -406,7 +407,7 @@ def main():
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@@ -23,6 +23,7 @@ import math
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional
import datasets
@@ -403,7 +404,7 @@ def main():
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

View File

@@ -22,6 +22,7 @@ import logging
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional, Union
import datasets
@@ -185,7 +186,7 @@ class DataCollatorForMultipleChoice:
flattened_features = [
[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
]
flattened_features = sum(flattened_features, [])
flattened_features = list(chain(*flattened_features))
batch = self.tokenizer.pad(
flattened_features,
@@ -333,8 +334,8 @@ def main():
]
# Flatten out
first_sentences = sum(first_sentences, [])
second_sentences = sum(second_sentences, [])
first_sentences = list(chain(*first_sentences))
second_sentences = list(chain(*second_sentences))
# Tokenize
tokenized_examples = tokenizer(

View File

@@ -24,6 +24,7 @@ import math
import os
import random
from dataclasses import dataclass
from itertools import chain
from pathlib import Path
from typing import Optional, Union
@@ -224,7 +225,7 @@ class DataCollatorForMultipleChoice:
flattened_features = [
[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
]
flattened_features = sum(flattened_features, [])
flattened_features = list(chain(*flattened_features))
batch = self.tokenizer.pad(
flattened_features,
@@ -365,8 +366,8 @@ def main():
labels = examples[label_column_name]
# Flatten out
first_sentences = sum(first_sentences, [])
second_sentences = sum(second_sentences, [])
first_sentences = list(chain(*first_sentences))
second_sentences = list(chain(*second_sentences))
# Tokenize
tokenized_examples = tokenizer(