map only on one process (#13810)
This commit is contained in:
committed by
GitHub
parent
9a9805fccf
commit
44eb8bdeea
@@ -337,6 +337,7 @@ def main():
|
|||||||
def tokenize_function(examples):
|
def tokenize_function(examples):
|
||||||
return tokenizer(examples[text_column_name])
|
return tokenizer(examples[text_column_name])
|
||||||
|
|
||||||
|
with accelerator.main_process_first():
|
||||||
tokenized_datasets = raw_datasets.map(
|
tokenized_datasets = raw_datasets.map(
|
||||||
tokenize_function,
|
tokenize_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -386,6 +387,7 @@ def main():
|
|||||||
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
||||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
||||||
|
|
||||||
|
with accelerator.main_process_first():
|
||||||
lm_datasets = tokenized_datasets.map(
|
lm_datasets = tokenized_datasets.map(
|
||||||
group_texts,
|
group_texts,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -374,6 +374,7 @@ def main():
|
|||||||
return_special_tokens_mask=True,
|
return_special_tokens_mask=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
with accelerator.main_process_first():
|
||||||
tokenized_datasets = raw_datasets.map(
|
tokenized_datasets = raw_datasets.map(
|
||||||
tokenize_function,
|
tokenize_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -389,6 +390,7 @@ def main():
|
|||||||
def tokenize_function(examples):
|
def tokenize_function(examples):
|
||||||
return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
|
return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
|
||||||
|
|
||||||
|
with accelerator.main_process_first():
|
||||||
tokenized_datasets = raw_datasets.map(
|
tokenized_datasets = raw_datasets.map(
|
||||||
tokenize_function,
|
tokenize_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -422,6 +424,7 @@ def main():
|
|||||||
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
||||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
||||||
|
|
||||||
|
with accelerator.main_process_first():
|
||||||
tokenized_datasets = tokenized_datasets.map(
|
tokenized_datasets = tokenized_datasets.map(
|
||||||
group_texts,
|
group_texts,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -381,6 +381,7 @@ def main():
|
|||||||
tokenized_inputs["labels"] = labels
|
tokenized_inputs["labels"] = labels
|
||||||
return tokenized_inputs
|
return tokenized_inputs
|
||||||
|
|
||||||
|
with accelerator.main_process_first():
|
||||||
processed_datasets = raw_datasets.map(
|
processed_datasets = raw_datasets.map(
|
||||||
preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
|
preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -440,6 +440,7 @@ def main():
|
|||||||
# We will select sample from whole data if argument is specified
|
# We will select sample from whole data if argument is specified
|
||||||
train_dataset = train_dataset.select(range(args.max_train_samples))
|
train_dataset = train_dataset.select(range(args.max_train_samples))
|
||||||
# Create train feature from dataset
|
# Create train feature from dataset
|
||||||
|
with accelerator.main_process_first():
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
prepare_train_features,
|
prepare_train_features,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -530,6 +531,7 @@ def main():
|
|||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
eval_examples = eval_examples.select(range(args.max_eval_samples))
|
eval_examples = eval_examples.select(range(args.max_eval_samples))
|
||||||
# Validation Feature Creation
|
# Validation Feature Creation
|
||||||
|
with accelerator.main_process_first():
|
||||||
eval_dataset = eval_examples.map(
|
eval_dataset = eval_examples.map(
|
||||||
prepare_validation_features,
|
prepare_validation_features,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -551,6 +553,7 @@ def main():
|
|||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
predict_examples = predict_examples.select(range(args.max_predict_samples))
|
predict_examples = predict_examples.select(range(args.max_predict_samples))
|
||||||
# Predict Feature Creation
|
# Predict Feature Creation
|
||||||
|
with accelerator.main_process_first():
|
||||||
predict_dataset = predict_examples.map(
|
predict_dataset = predict_examples.map(
|
||||||
prepare_validation_features,
|
prepare_validation_features,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -468,7 +468,9 @@ def main():
|
|||||||
if args.max_train_samples is not None:
|
if args.max_train_samples is not None:
|
||||||
# We will select sample from whole data if argument is specified
|
# We will select sample from whole data if argument is specified
|
||||||
train_dataset = train_dataset.select(range(args.max_train_samples))
|
train_dataset = train_dataset.select(range(args.max_train_samples))
|
||||||
|
|
||||||
# Create train feature from dataset
|
# Create train feature from dataset
|
||||||
|
with accelerator.main_process_first():
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
prepare_train_features,
|
prepare_train_features,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -535,6 +537,7 @@ def main():
|
|||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
eval_examples = eval_examples.select(range(args.max_eval_samples))
|
eval_examples = eval_examples.select(range(args.max_eval_samples))
|
||||||
# Validation Feature Creation
|
# Validation Feature Creation
|
||||||
|
with accelerator.main_process_first():
|
||||||
eval_dataset = eval_examples.map(
|
eval_dataset = eval_examples.map(
|
||||||
prepare_validation_features,
|
prepare_validation_features,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -556,6 +559,7 @@ def main():
|
|||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
predict_examples = predict_examples.select(range(args.max_predict_samples))
|
predict_examples = predict_examples.select(range(args.max_predict_samples))
|
||||||
# Predict Feature Creation
|
# Predict Feature Creation
|
||||||
|
with accelerator.main_process_first():
|
||||||
predict_dataset = predict_examples.map(
|
predict_dataset = predict_examples.map(
|
||||||
prepare_validation_features,
|
prepare_validation_features,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -439,6 +439,7 @@ def main():
|
|||||||
model_inputs["labels"] = labels["input_ids"]
|
model_inputs["labels"] = labels["input_ids"]
|
||||||
return model_inputs
|
return model_inputs
|
||||||
|
|
||||||
|
with accelerator.main_process_first():
|
||||||
processed_datasets = raw_datasets.map(
|
processed_datasets = raw_datasets.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -330,6 +330,7 @@ def main():
|
|||||||
result["labels"] = examples["label"]
|
result["labels"] = examples["label"]
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
with accelerator.main_process_first():
|
||||||
processed_datasets = raw_datasets.map(
|
processed_datasets = raw_datasets.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -403,6 +403,7 @@ def main():
|
|||||||
tokenized_inputs["labels"] = labels
|
tokenized_inputs["labels"] = labels
|
||||||
return tokenized_inputs
|
return tokenized_inputs
|
||||||
|
|
||||||
|
with accelerator.main_process_first():
|
||||||
processed_raw_datasets = raw_datasets.map(
|
processed_raw_datasets = raw_datasets.map(
|
||||||
tokenize_and_align_labels,
|
tokenize_and_align_labels,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -418,6 +418,7 @@ def main():
|
|||||||
model_inputs["labels"] = labels["input_ids"]
|
model_inputs["labels"] = labels["input_ids"]
|
||||||
return model_inputs
|
return model_inputs
|
||||||
|
|
||||||
|
with accelerator.main_process_first():
|
||||||
processed_datasets = raw_datasets.map(
|
processed_datasets = raw_datasets.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
Reference in New Issue
Block a user