map only on one process (#13810)
This commit is contained in:
committed by
GitHub
parent
9a9805fccf
commit
44eb8bdeea
@@ -439,13 +439,14 @@ def main():
|
||||
model_inputs["labels"] = labels["input_ids"]
|
||||
return model_inputs
|
||||
|
||||
processed_datasets = raw_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
remove_columns=column_names,
|
||||
load_from_cache_file=not args.overwrite_cache,
|
||||
desc="Running tokenizer on dataset",
|
||||
)
|
||||
with accelerator.main_process_first():
|
||||
processed_datasets = raw_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
remove_columns=column_names,
|
||||
load_from_cache_file=not args.overwrite_cache,
|
||||
desc="Running tokenizer on dataset",
|
||||
)
|
||||
|
||||
train_dataset = processed_datasets["train"]
|
||||
eval_dataset = processed_datasets["validation"]
|
||||
|
||||
Reference in New Issue
Block a user