map only on one process (#13810)
This commit is contained in:
committed by
GitHub
parent
9a9805fccf
commit
44eb8bdeea
@@ -337,14 +337,15 @@ def main():
|
|||||||
def tokenize_function(examples):
|
def tokenize_function(examples):
|
||||||
return tokenizer(examples[text_column_name])
|
return tokenizer(examples[text_column_name])
|
||||||
|
|
||||||
tokenized_datasets = raw_datasets.map(
|
with accelerator.main_process_first():
|
||||||
tokenize_function,
|
tokenized_datasets = raw_datasets.map(
|
||||||
batched=True,
|
tokenize_function,
|
||||||
num_proc=args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on dataset",
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on dataset",
|
||||||
|
)
|
||||||
|
|
||||||
if args.block_size is None:
|
if args.block_size is None:
|
||||||
block_size = tokenizer.model_max_length
|
block_size = tokenizer.model_max_length
|
||||||
@@ -386,13 +387,14 @@ def main():
|
|||||||
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
||||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
||||||
|
|
||||||
lm_datasets = tokenized_datasets.map(
|
with accelerator.main_process_first():
|
||||||
group_texts,
|
lm_datasets = tokenized_datasets.map(
|
||||||
batched=True,
|
group_texts,
|
||||||
num_proc=args.preprocessing_num_workers,
|
batched=True,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
num_proc=args.preprocessing_num_workers,
|
||||||
desc=f"Grouping texts in chunks of {block_size}",
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
)
|
desc=f"Grouping texts in chunks of {block_size}",
|
||||||
|
)
|
||||||
|
|
||||||
train_dataset = lm_datasets["train"]
|
train_dataset = lm_datasets["train"]
|
||||||
eval_dataset = lm_datasets["validation"]
|
eval_dataset = lm_datasets["validation"]
|
||||||
|
|||||||
@@ -374,14 +374,15 @@ def main():
|
|||||||
return_special_tokens_mask=True,
|
return_special_tokens_mask=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
tokenized_datasets = raw_datasets.map(
|
with accelerator.main_process_first():
|
||||||
tokenize_function,
|
tokenized_datasets = raw_datasets.map(
|
||||||
batched=True,
|
tokenize_function,
|
||||||
num_proc=args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=[text_column_name],
|
num_proc=args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
remove_columns=[text_column_name],
|
||||||
desc="Running tokenizer on dataset line_by_line",
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on dataset line_by_line",
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
||||||
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
|
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
|
||||||
@@ -389,14 +390,15 @@ def main():
|
|||||||
def tokenize_function(examples):
|
def tokenize_function(examples):
|
||||||
return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
|
return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
|
||||||
|
|
||||||
tokenized_datasets = raw_datasets.map(
|
with accelerator.main_process_first():
|
||||||
tokenize_function,
|
tokenized_datasets = raw_datasets.map(
|
||||||
batched=True,
|
tokenize_function,
|
||||||
num_proc=args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on every text in dataset",
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on every text in dataset",
|
||||||
|
)
|
||||||
|
|
||||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
||||||
# max_seq_length.
|
# max_seq_length.
|
||||||
@@ -422,13 +424,14 @@ def main():
|
|||||||
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
||||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
||||||
|
|
||||||
tokenized_datasets = tokenized_datasets.map(
|
with accelerator.main_process_first():
|
||||||
group_texts,
|
tokenized_datasets = tokenized_datasets.map(
|
||||||
batched=True,
|
group_texts,
|
||||||
num_proc=args.preprocessing_num_workers,
|
batched=True,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
num_proc=args.preprocessing_num_workers,
|
||||||
desc=f"Grouping texts in chunks of {max_seq_length}",
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
)
|
desc=f"Grouping texts in chunks of {max_seq_length}",
|
||||||
|
)
|
||||||
|
|
||||||
train_dataset = tokenized_datasets["train"]
|
train_dataset = tokenized_datasets["train"]
|
||||||
eval_dataset = tokenized_datasets["validation"]
|
eval_dataset = tokenized_datasets["validation"]
|
||||||
|
|||||||
@@ -381,9 +381,10 @@ def main():
|
|||||||
tokenized_inputs["labels"] = labels
|
tokenized_inputs["labels"] = labels
|
||||||
return tokenized_inputs
|
return tokenized_inputs
|
||||||
|
|
||||||
processed_datasets = raw_datasets.map(
|
with accelerator.main_process_first():
|
||||||
preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
|
processed_datasets = raw_datasets.map(
|
||||||
)
|
preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names
|
||||||
|
)
|
||||||
|
|
||||||
train_dataset = processed_datasets["train"]
|
train_dataset = processed_datasets["train"]
|
||||||
eval_dataset = processed_datasets["validation"]
|
eval_dataset = processed_datasets["validation"]
|
||||||
|
|||||||
@@ -440,14 +440,15 @@ def main():
|
|||||||
# We will select sample from whole data if argument is specified
|
# We will select sample from whole data if argument is specified
|
||||||
train_dataset = train_dataset.select(range(args.max_train_samples))
|
train_dataset = train_dataset.select(range(args.max_train_samples))
|
||||||
# Create train feature from dataset
|
# Create train feature from dataset
|
||||||
train_dataset = train_dataset.map(
|
with accelerator.main_process_first():
|
||||||
prepare_train_features,
|
train_dataset = train_dataset.map(
|
||||||
batched=True,
|
prepare_train_features,
|
||||||
num_proc=args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on train dataset",
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on train dataset",
|
||||||
|
)
|
||||||
if args.max_train_samples is not None:
|
if args.max_train_samples is not None:
|
||||||
# Number of samples might increase during Feature Creation, We select only specified max samples
|
# Number of samples might increase during Feature Creation, We select only specified max samples
|
||||||
train_dataset = train_dataset.select(range(args.max_train_samples))
|
train_dataset = train_dataset.select(range(args.max_train_samples))
|
||||||
@@ -530,14 +531,15 @@ def main():
|
|||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
eval_examples = eval_examples.select(range(args.max_eval_samples))
|
eval_examples = eval_examples.select(range(args.max_eval_samples))
|
||||||
# Validation Feature Creation
|
# Validation Feature Creation
|
||||||
eval_dataset = eval_examples.map(
|
with accelerator.main_process_first():
|
||||||
prepare_validation_features,
|
eval_dataset = eval_examples.map(
|
||||||
batched=True,
|
prepare_validation_features,
|
||||||
num_proc=args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on validation dataset",
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on validation dataset",
|
||||||
|
)
|
||||||
|
|
||||||
if args.max_eval_samples is not None:
|
if args.max_eval_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
@@ -551,17 +553,18 @@ def main():
|
|||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
predict_examples = predict_examples.select(range(args.max_predict_samples))
|
predict_examples = predict_examples.select(range(args.max_predict_samples))
|
||||||
# Predict Feature Creation
|
# Predict Feature Creation
|
||||||
predict_dataset = predict_examples.map(
|
with accelerator.main_process_first():
|
||||||
prepare_validation_features,
|
predict_dataset = predict_examples.map(
|
||||||
batched=True,
|
prepare_validation_features,
|
||||||
num_proc=args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on prediction dataset",
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on prediction dataset",
|
||||||
if args.max_predict_samples is not None:
|
)
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
if args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(args.max_predict_samples))
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
|
predict_dataset = predict_dataset.select(range(args.max_predict_samples))
|
||||||
|
|
||||||
# Log a few random samples from the training set:
|
# Log a few random samples from the training set:
|
||||||
for index in random.sample(range(len(train_dataset)), 3):
|
for index in random.sample(range(len(train_dataset)), 3):
|
||||||
|
|||||||
@@ -468,18 +468,20 @@ def main():
|
|||||||
if args.max_train_samples is not None:
|
if args.max_train_samples is not None:
|
||||||
# We will select sample from whole data if argument is specified
|
# We will select sample from whole data if argument is specified
|
||||||
train_dataset = train_dataset.select(range(args.max_train_samples))
|
train_dataset = train_dataset.select(range(args.max_train_samples))
|
||||||
|
|
||||||
# Create train feature from dataset
|
# Create train feature from dataset
|
||||||
train_dataset = train_dataset.map(
|
with accelerator.main_process_first():
|
||||||
prepare_train_features,
|
train_dataset = train_dataset.map(
|
||||||
batched=True,
|
prepare_train_features,
|
||||||
num_proc=args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on train dataset",
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on train dataset",
|
||||||
if args.max_train_samples is not None:
|
)
|
||||||
# Number of samples might increase during Feature Creation, We select only specified max samples
|
if args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(args.max_train_samples))
|
# Number of samples might increase during Feature Creation, We select only specified max samples
|
||||||
|
train_dataset = train_dataset.select(range(args.max_train_samples))
|
||||||
|
|
||||||
# Validation preprocessing
|
# Validation preprocessing
|
||||||
def prepare_validation_features(examples):
|
def prepare_validation_features(examples):
|
||||||
@@ -535,14 +537,15 @@ def main():
|
|||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
eval_examples = eval_examples.select(range(args.max_eval_samples))
|
eval_examples = eval_examples.select(range(args.max_eval_samples))
|
||||||
# Validation Feature Creation
|
# Validation Feature Creation
|
||||||
eval_dataset = eval_examples.map(
|
with accelerator.main_process_first():
|
||||||
prepare_validation_features,
|
eval_dataset = eval_examples.map(
|
||||||
batched=True,
|
prepare_validation_features,
|
||||||
num_proc=args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on validation dataset",
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on validation dataset",
|
||||||
|
)
|
||||||
|
|
||||||
if args.max_eval_samples is not None:
|
if args.max_eval_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
@@ -556,17 +559,18 @@ def main():
|
|||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
predict_examples = predict_examples.select(range(args.max_predict_samples))
|
predict_examples = predict_examples.select(range(args.max_predict_samples))
|
||||||
# Predict Feature Creation
|
# Predict Feature Creation
|
||||||
predict_dataset = predict_examples.map(
|
with accelerator.main_process_first():
|
||||||
prepare_validation_features,
|
predict_dataset = predict_examples.map(
|
||||||
batched=True,
|
prepare_validation_features,
|
||||||
num_proc=args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on prediction dataset",
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on prediction dataset",
|
||||||
if args.max_predict_samples is not None:
|
)
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
if args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(args.max_predict_samples))
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
|
predict_dataset = predict_dataset.select(range(args.max_predict_samples))
|
||||||
|
|
||||||
# Log a few random samples from the training set:
|
# Log a few random samples from the training set:
|
||||||
for index in random.sample(range(len(train_dataset)), 3):
|
for index in random.sample(range(len(train_dataset)), 3):
|
||||||
|
|||||||
@@ -439,13 +439,14 @@ def main():
|
|||||||
model_inputs["labels"] = labels["input_ids"]
|
model_inputs["labels"] = labels["input_ids"]
|
||||||
return model_inputs
|
return model_inputs
|
||||||
|
|
||||||
processed_datasets = raw_datasets.map(
|
with accelerator.main_process_first():
|
||||||
preprocess_function,
|
processed_datasets = raw_datasets.map(
|
||||||
batched=True,
|
preprocess_function,
|
||||||
remove_columns=column_names,
|
batched=True,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on dataset",
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on dataset",
|
||||||
|
)
|
||||||
|
|
||||||
train_dataset = processed_datasets["train"]
|
train_dataset = processed_datasets["train"]
|
||||||
eval_dataset = processed_datasets["validation"]
|
eval_dataset = processed_datasets["validation"]
|
||||||
|
|||||||
@@ -330,12 +330,13 @@ def main():
|
|||||||
result["labels"] = examples["label"]
|
result["labels"] = examples["label"]
|
||||||
return result
|
return result
|
||||||
|
|
||||||
processed_datasets = raw_datasets.map(
|
with accelerator.main_process_first():
|
||||||
preprocess_function,
|
processed_datasets = raw_datasets.map(
|
||||||
batched=True,
|
preprocess_function,
|
||||||
remove_columns=raw_datasets["train"].column_names,
|
batched=True,
|
||||||
desc="Running tokenizer on dataset",
|
remove_columns=raw_datasets["train"].column_names,
|
||||||
)
|
desc="Running tokenizer on dataset",
|
||||||
|
)
|
||||||
|
|
||||||
train_dataset = processed_datasets["train"]
|
train_dataset = processed_datasets["train"]
|
||||||
eval_dataset = processed_datasets["validation_matched" if args.task_name == "mnli" else "validation"]
|
eval_dataset = processed_datasets["validation_matched" if args.task_name == "mnli" else "validation"]
|
||||||
|
|||||||
@@ -403,12 +403,13 @@ def main():
|
|||||||
tokenized_inputs["labels"] = labels
|
tokenized_inputs["labels"] = labels
|
||||||
return tokenized_inputs
|
return tokenized_inputs
|
||||||
|
|
||||||
processed_raw_datasets = raw_datasets.map(
|
with accelerator.main_process_first():
|
||||||
tokenize_and_align_labels,
|
processed_raw_datasets = raw_datasets.map(
|
||||||
batched=True,
|
tokenize_and_align_labels,
|
||||||
remove_columns=raw_datasets["train"].column_names,
|
batched=True,
|
||||||
desc="Running tokenizer on dataset",
|
remove_columns=raw_datasets["train"].column_names,
|
||||||
)
|
desc="Running tokenizer on dataset",
|
||||||
|
)
|
||||||
|
|
||||||
train_dataset = processed_raw_datasets["train"]
|
train_dataset = processed_raw_datasets["train"]
|
||||||
eval_dataset = processed_raw_datasets["validation"]
|
eval_dataset = processed_raw_datasets["validation"]
|
||||||
|
|||||||
@@ -418,14 +418,15 @@ def main():
|
|||||||
model_inputs["labels"] = labels["input_ids"]
|
model_inputs["labels"] = labels["input_ids"]
|
||||||
return model_inputs
|
return model_inputs
|
||||||
|
|
||||||
processed_datasets = raw_datasets.map(
|
with accelerator.main_process_first():
|
||||||
preprocess_function,
|
processed_datasets = raw_datasets.map(
|
||||||
batched=True,
|
preprocess_function,
|
||||||
num_proc=args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on dataset",
|
load_from_cache_file=not args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on dataset",
|
||||||
|
)
|
||||||
|
|
||||||
train_dataset = processed_datasets["train"]
|
train_dataset = processed_datasets["train"]
|
||||||
eval_dataset = processed_datasets["validation"]
|
eval_dataset = processed_datasets["validation"]
|
||||||
|
|||||||
Reference in New Issue
Block a user