map only on one process (#13810)

2021-09-30 18:52:53 +02:00
parent 9a9805fccf
commit 44eb8bdeea
9 changed files with 142 additions and 125 deletions
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -439,13 +439,14 @@ def main():
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )

    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation"]