From 69e16abf98c94b8a6d2cf7d60ca36f13e4fbee58 Mon Sep 17 00:00:00 2001 From: Nicholas Broad Date: Mon, 22 Nov 2021 16:17:26 -0500 Subject: [PATCH] Switch from using sum for flattening lists of lists in group_texts (#14472) * remove sum for list flattening * change to chain(*) * make chain object a list * delete empty lines per sgugger's suggestions Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Nicholas Broad Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> --- examples/flax/language-modeling/run_clm_flax.py | 3 ++- examples/flax/language-modeling/run_mlm_flax.py | 3 ++- examples/flax/language-modeling/run_t5_mlm_flax.py | 3 ++- examples/pytorch/language-modeling/run_clm.py | 3 ++- examples/pytorch/language-modeling/run_clm_no_trainer.py | 3 ++- examples/pytorch/language-modeling/run_mlm.py | 3 ++- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 3 ++- examples/pytorch/language-modeling/run_plm.py | 3 ++- examples/pytorch/multiple-choice/run_swag.py | 7 ++++--- examples/pytorch/multiple-choice/run_swag_no_trainer.py | 7 ++++--- .../jax-projects/model_parallel/run_clm_mp.py | 3 ++- examples/tensorflow/language-modeling/run_clm.py | 3 ++- examples/tensorflow/language-modeling/run_mlm.py | 3 ++- examples/tensorflow/multiple-choice/run_swag.py | 5 +++-- src/transformers/file_utils.py | 3 ++- 15 files changed, 35 insertions(+), 20 deletions(-) diff --git a/examples/flax/language-modeling/run_clm_flax.py b/examples/flax/language-modeling/run_clm_flax.py index 95c313c6d3..50054a6044 100755 --- a/examples/flax/language-modeling/run_clm_flax.py +++ b/examples/flax/language-modeling/run_clm_flax.py @@ -27,6 +27,7 @@ import os import sys import time from dataclasses import dataclass, field +from itertools import chain from pathlib import Path from typing import Callable, Optional @@ -430,7 +431,7 @@ def main(): # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. def group_texts(examples): # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py index 322479148d..3be4bf387d 100755 --- a/examples/flax/language-modeling/run_mlm_flax.py +++ b/examples/flax/language-modeling/run_mlm_flax.py @@ -25,6 +25,7 @@ import os import sys import time from dataclasses import dataclass, field +from itertools import chain # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments. from pathlib import Path @@ -453,7 +454,7 @@ if __name__ == "__main__": # max_seq_length. def group_texts(examples): # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py index e75b0f290f..b78dc0431a 100755 --- a/examples/flax/language-modeling/run_t5_mlm_flax.py +++ b/examples/flax/language-modeling/run_t5_mlm_flax.py @@ -25,6 +25,7 @@ import os import sys import time from dataclasses import dataclass, field +from itertools import chain from pathlib import Path from typing import Dict, List, Optional @@ -563,7 +564,7 @@ if __name__ == "__main__": # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length. def group_texts(examples): # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index 444df1b809..f098f139ae 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -26,6 +26,7 @@ import math import os import sys from dataclasses import dataclass, field +from itertools import chain from typing import Optional import datasets @@ -408,7 +409,7 @@ def main(): # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. def group_texts(examples): # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index ef9edffb34..ed0702e3bb 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -27,6 +27,7 @@ import logging import math import os import random +from itertools import chain from pathlib import Path import datasets @@ -366,7 +367,7 @@ def main(): # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. def group_texts(examples): # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index a1b5b7aca3..3f8ab03f45 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -26,6 +26,7 @@ import math import os import sys from dataclasses import dataclass, field +from itertools import chain from typing import Optional import datasets @@ -432,7 +433,7 @@ def main(): # max_seq_length. def group_texts(examples): # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index e356741daf..2fc492daa1 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -27,6 +27,7 @@ import logging import math import os import random +from itertools import chain from pathlib import Path import datasets @@ -406,7 +407,7 @@ def main(): # max_seq_length. def group_texts(examples): # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 840bfa9ad6..063393e0a4 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -23,6 +23,7 @@ import math import os import sys from dataclasses import dataclass, field +from itertools import chain from typing import Optional import datasets @@ -403,7 +404,7 @@ def main(): # max_seq_length. def group_texts(examples): # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index 54a80a5c32..b18ea1288c 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -22,6 +22,7 @@ import logging import os import sys from dataclasses import dataclass, field +from itertools import chain from typing import Optional, Union import datasets @@ -185,7 +186,7 @@ class DataCollatorForMultipleChoice: flattened_features = [ [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features ] - flattened_features = sum(flattened_features, []) + flattened_features = list(chain(*flattened_features)) batch = self.tokenizer.pad( flattened_features, @@ -333,8 +334,8 @@ def main(): ] # Flatten out - first_sentences = sum(first_sentences, []) - second_sentences = sum(second_sentences, []) + first_sentences = list(chain(*first_sentences)) + second_sentences = list(chain(*second_sentences)) # Tokenize tokenized_examples = tokenizer( diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 07d212a65a..6f0f38a831 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -24,6 +24,7 @@ import math import os import random from dataclasses import dataclass +from itertools import chain from pathlib import Path from typing import Optional, Union @@ -224,7 +225,7 @@ class DataCollatorForMultipleChoice: flattened_features = [ [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features ] - flattened_features = sum(flattened_features, []) + flattened_features = list(chain(*flattened_features)) batch = self.tokenizer.pad( flattened_features, @@ -365,8 +366,8 @@ def main(): labels = examples[label_column_name] # Flatten out - first_sentences = sum(first_sentences, []) - second_sentences = sum(second_sentences, []) + first_sentences = list(chain(*first_sentences)) + second_sentences = list(chain(*second_sentences)) # Tokenize tokenized_examples = tokenizer( diff --git a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py index 34fa5d3b15..c56f10478f 100644 --- a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py +++ b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py @@ -23,6 +23,7 @@ import os import sys import time from dataclasses import dataclass, field +from itertools import chain from pathlib import Path from typing import Callable, Optional @@ -364,7 +365,7 @@ def main(): # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. def group_texts(examples): # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py index 5f1adc5ccf..d8383b0f24 100755 --- a/examples/tensorflow/language-modeling/run_clm.py +++ b/examples/tensorflow/language-modeling/run_clm.py @@ -30,6 +30,7 @@ import random import sys from dataclasses import dataclass, field from functools import partial +from itertools import chain from pathlib import Path from typing import Optional @@ -406,7 +407,7 @@ def main(): # Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size. def group_texts(examples): # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py index 244a3a9a47..c4f318416c 100755 --- a/examples/tensorflow/language-modeling/run_mlm.py +++ b/examples/tensorflow/language-modeling/run_mlm.py @@ -32,6 +32,7 @@ import random import sys from dataclasses import dataclass, field from functools import partial +from itertools import chain from pathlib import Path from typing import Optional @@ -462,7 +463,7 @@ def main(): # max_seq_length. def group_texts(examples): # Concatenate all texts. - concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()} total_length = len(concatenated_examples[list(examples.keys())[0]]) # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can # customize this part to your needs. diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index 77dab86f5b..56e6012ac6 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -22,6 +22,7 @@ import logging import os import sys from dataclasses import dataclass, field +from itertools import chain from pathlib import Path from typing import Optional @@ -342,8 +343,8 @@ def main(): ] # Flatten out - first_sentences = sum(first_sentences, []) - second_sentences = sum(second_sentences, []) + first_sentences = list(chain(*first_sentences)) + second_sentences = list(chain(*second_sentences)) # Tokenize tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True, max_length=max_seq_length) diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index ae9fc49980..5d99b36c14 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -35,6 +35,7 @@ from dataclasses import fields from enum import Enum from functools import partial, wraps from hashlib import sha256 +from itertools import chain from pathlib import Path from types import ModuleType from typing import Any, BinaryIO, ContextManager, Dict, List, Optional, Tuple, Union @@ -2129,7 +2130,7 @@ class _LazyModule(ModuleType): for value in values: self._class_to_module[value] = key # Needed for autocompletion in an IDE - self.__all__ = list(import_structure.keys()) + sum(import_structure.values(), []) + self.__all__ = list(import_structure.keys()) + list(chain(*import_structure.values())) self.__file__ = module_file self.__spec__ = module_spec self.__path__ = [os.path.dirname(module_file)]