map only on one process (#13810)
This commit is contained in:
committed by
GitHub
parent
9a9805fccf
commit
44eb8bdeea
@@ -337,6 +337,7 @@ def main():
|
||||
def tokenize_function(examples):
|
||||
return tokenizer(examples[text_column_name])
|
||||
|
||||
with accelerator.main_process_first():
|
||||
tokenized_datasets = raw_datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
@@ -386,6 +387,7 @@ def main():
|
||||
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
||||
|
||||
with accelerator.main_process_first():
|
||||
lm_datasets = tokenized_datasets.map(
|
||||
group_texts,
|
||||
batched=True,
|
||||
|
||||
@@ -374,6 +374,7 @@ def main():
|
||||
return_special_tokens_mask=True,
|
||||
)
|
||||
|
||||
with accelerator.main_process_first():
|
||||
tokenized_datasets = raw_datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
@@ -389,6 +390,7 @@ def main():
|
||||
def tokenize_function(examples):
|
||||
return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
|
||||
|
||||
with accelerator.main_process_first():
|
||||
tokenized_datasets = raw_datasets.map(
|
||||
tokenize_function,
|
||||
batched=True,
|
||||
@@ -422,6 +424,7 @@ def main():
|
||||
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
||||
|
||||
with accelerator.main_process_first():
|
||||
tokenized_datasets = tokenized_datasets.map(
|
||||
group_texts,
|
||||
batched=True,
|
||||
|
||||
@@ -381,6 +381,7 @@ def main():
|
||||
tokenized_inputs["labels"] = labels
|
||||
return tokenized_inputs
|
||||
|
||||
with accelerator.main_process_first():
|
||||
processed_datasets = raw_datasets.map(
|
||||
preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
|
||||
)
|
||||
|
||||
@@ -440,6 +440,7 @@ def main():
|
||||
# We will select sample from whole data if argument is specified
|
||||
train_dataset = train_dataset.select(range(args.max_train_samples))
|
||||
# Create train feature from dataset
|
||||
with accelerator.main_process_first():
|
||||
train_dataset = train_dataset.map(
|
||||
prepare_train_features,
|
||||
batched=True,
|
||||
@@ -530,6 +531,7 @@ def main():
|
||||
# We will select sample from whole data
|
||||
eval_examples = eval_examples.select(range(args.max_eval_samples))
|
||||
# Validation Feature Creation
|
||||
with accelerator.main_process_first():
|
||||
eval_dataset = eval_examples.map(
|
||||
prepare_validation_features,
|
||||
batched=True,
|
||||
@@ -551,6 +553,7 @@ def main():
|
||||
# We will select sample from whole data
|
||||
predict_examples = predict_examples.select(range(args.max_predict_samples))
|
||||
# Predict Feature Creation
|
||||
with accelerator.main_process_first():
|
||||
predict_dataset = predict_examples.map(
|
||||
prepare_validation_features,
|
||||
batched=True,
|
||||
|
||||
@@ -468,7 +468,9 @@ def main():
|
||||
if args.max_train_samples is not None:
|
||||
# We will select sample from whole data if argument is specified
|
||||
train_dataset = train_dataset.select(range(args.max_train_samples))
|
||||
|
||||
# Create train feature from dataset
|
||||
with accelerator.main_process_first():
|
||||
train_dataset = train_dataset.map(
|
||||
prepare_train_features,
|
||||
batched=True,
|
||||
@@ -535,6 +537,7 @@ def main():
|
||||
# We will select sample from whole data
|
||||
eval_examples = eval_examples.select(range(args.max_eval_samples))
|
||||
# Validation Feature Creation
|
||||
with accelerator.main_process_first():
|
||||
eval_dataset = eval_examples.map(
|
||||
prepare_validation_features,
|
||||
batched=True,
|
||||
@@ -556,6 +559,7 @@ def main():
|
||||
# We will select sample from whole data
|
||||
predict_examples = predict_examples.select(range(args.max_predict_samples))
|
||||
# Predict Feature Creation
|
||||
with accelerator.main_process_first():
|
||||
predict_dataset = predict_examples.map(
|
||||
prepare_validation_features,
|
||||
batched=True,
|
||||
|
||||
@@ -439,6 +439,7 @@ def main():
|
||||
model_inputs["labels"] = labels["input_ids"]
|
||||
return model_inputs
|
||||
|
||||
with accelerator.main_process_first():
|
||||
processed_datasets = raw_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
|
||||
@@ -330,6 +330,7 @@ def main():
|
||||
result["labels"] = examples["label"]
|
||||
return result
|
||||
|
||||
with accelerator.main_process_first():
|
||||
processed_datasets = raw_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
|
||||
@@ -403,6 +403,7 @@ def main():
|
||||
tokenized_inputs["labels"] = labels
|
||||
return tokenized_inputs
|
||||
|
||||
with accelerator.main_process_first():
|
||||
processed_raw_datasets = raw_datasets.map(
|
||||
tokenize_and_align_labels,
|
||||
batched=True,
|
||||
|
||||
@@ -418,6 +418,7 @@ def main():
|
||||
model_inputs["labels"] = labels["input_ids"]
|
||||
return model_inputs
|
||||
|
||||
with accelerator.main_process_first():
|
||||
processed_datasets = raw_datasets.map(
|
||||
preprocess_function,
|
||||
batched=True,
|
||||
|
||||
Reference in New Issue
Block a user