map only on one process (#13810)

This commit is contained in:
Patrick von Platen
2021-09-30 18:52:53 +02:00
committed by GitHub
parent 9a9805fccf
commit 44eb8bdeea
9 changed files with 142 additions and 125 deletions

View File

@@ -439,13 +439,14 @@ def main():
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
 
-    processed_datasets = raw_datasets.map(
-        preprocess_function,
-        batched=True,
-        remove_columns=column_names,
-        load_from_cache_file=not args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with accelerator.main_process_first():
+        processed_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            remove_columns=column_names,
+            load_from_cache_file=not args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
 
     train_dataset = processed_datasets["train"]
     eval_dataset = processed_datasets["validation"]