Switch from using sum for flattening lists of lists in group_texts (#14472)

* remove sum for list flattening
* change to chain(*)
* make chain object a list
* delete empty lines per sgugger's suggestions

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Nicholas Broad <nicholas@nmbroad.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-11-22 16:17:26 -05:00
parent 0b7d053c13
commit 69e16abf98
15 changed files with 35 additions and 20 deletions
--- a/examples/pytorch/multiple-choice/run_swag.py
+++ b/examples/pytorch/multiple-choice/run_swag.py
@@ -22,6 +22,7 @@ import logging
 import os
 import sys
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import Optional, Union

 import datasets
@@ -185,7 +186,7 @@ class DataCollatorForMultipleChoice:
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
-        flattened_features = sum(flattened_features, [])
+        flattened_features = list(chain(*flattened_features))

        batch = self.tokenizer.pad(
            flattened_features,
@@ -333,8 +334,8 @@ def main():
        ]

        # Flatten out
-        first_sentences = sum(first_sentences, [])
-        second_sentences = sum(second_sentences, [])
+        first_sentences = list(chain(*first_sentences))
+        second_sentences = list(chain(*second_sentences))

        # Tokenize
        tokenized_examples = tokenizer(
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -24,6 +24,7 @@ import math
 import os
 import random
 from dataclasses import dataclass
+from itertools import chain
 from pathlib import Path
 from typing import Optional, Union

@@ -224,7 +225,7 @@ class DataCollatorForMultipleChoice:
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
-        flattened_features = sum(flattened_features, [])
+        flattened_features = list(chain(*flattened_features))

        batch = self.tokenizer.pad(
            flattened_features,
@@ -365,8 +366,8 @@ def main():
        labels = examples[label_column_name]

        # Flatten out
-        first_sentences = sum(first_sentences, [])
-        second_sentences = sum(second_sentences, [])
+        first_sentences = list(chain(*first_sentences))
+        second_sentences = list(chain(*second_sentences))

        # Tokenize
        tokenized_examples = tokenizer(