[Examples] Added context manager to datasets map (#12367)
* added context manager to datasets map
* fixed style and spaces
* fixed deprecation warning
* changed desc
This commit is contained in:
@@ -356,6 +356,7 @@ def main():
|
|||||||
)
|
)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
with training_args.main_process_first(desc="dataset map tokenization"):
|
||||||
tokenized_datasets = raw_datasets.map(
|
tokenized_datasets = raw_datasets.map(
|
||||||
tokenize_function,
|
tokenize_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -404,6 +405,7 @@ def main():
|
|||||||
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
||||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
||||||
|
|
||||||
|
with training_args.main_process_first(desc="grouping texts together"):
|
||||||
lm_datasets = tokenized_datasets.map(
|
lm_datasets = tokenized_datasets.map(
|
||||||
group_texts,
|
group_texts,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -383,6 +383,7 @@ def main():
|
|||||||
return_special_tokens_mask=True,
|
return_special_tokens_mask=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
with training_args.main_process_first(desc="dataset map tokenization"):
|
||||||
tokenized_datasets = raw_datasets.map(
|
tokenized_datasets = raw_datasets.map(
|
||||||
tokenize_function,
|
tokenize_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -398,6 +399,7 @@ def main():
|
|||||||
def tokenize_function(examples):
|
def tokenize_function(examples):
|
||||||
return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
|
return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
|
||||||
|
|
||||||
|
with training_args.main_process_first(desc="dataset map tokenization"):
|
||||||
tokenized_datasets = raw_datasets.map(
|
tokenized_datasets = raw_datasets.map(
|
||||||
tokenize_function,
|
tokenize_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -430,6 +432,7 @@ def main():
|
|||||||
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
||||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
||||||
|
|
||||||
|
with training_args.main_process_first(desc="grouping texts together"):
|
||||||
tokenized_datasets = tokenized_datasets.map(
|
tokenized_datasets = tokenized_datasets.map(
|
||||||
group_texts,
|
group_texts,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -359,6 +359,7 @@ def main():
|
|||||||
examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
|
examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
|
||||||
return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)
|
return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)
|
||||||
|
|
||||||
|
with training_args.main_process_first(desc="dataset map tokenization"):
|
||||||
tokenized_datasets = raw_datasets.map(
|
tokenized_datasets = raw_datasets.map(
|
||||||
tokenize_function,
|
tokenize_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -372,6 +373,7 @@ def main():
|
|||||||
def tokenize_function(examples):
|
def tokenize_function(examples):
|
||||||
return tokenizer(examples[text_column_name])
|
return tokenizer(examples[text_column_name])
|
||||||
|
|
||||||
|
with training_args.main_process_first(desc="dataset map tokenization"):
|
||||||
tokenized_datasets = raw_datasets.map(
|
tokenized_datasets = raw_datasets.map(
|
||||||
tokenize_function,
|
tokenize_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -404,6 +406,7 @@ def main():
|
|||||||
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
||||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
||||||
|
|
||||||
|
with training_args.main_process_first(desc="grouping texts together"):
|
||||||
tokenized_datasets = tokenized_datasets.map(
|
tokenized_datasets = tokenized_datasets.map(
|
||||||
group_texts,
|
group_texts,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -353,6 +353,7 @@ def main():
|
|||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -366,6 +367,7 @@ def main():
|
|||||||
eval_dataset = raw_datasets["validation"]
|
eval_dataset = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||||
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_dataset.map(
|
eval_dataset = eval_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -418,6 +418,7 @@ def main():
|
|||||||
# We will select sample from whole data if agument is specified
|
# We will select sample from whole data if agument is specified
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
# Create train feature from dataset
|
# Create train feature from dataset
|
||||||
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
prepare_train_features,
|
prepare_train_features,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -480,6 +481,7 @@ def main():
|
|||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
||||||
# Validation Feature Creation
|
# Validation Feature Creation
|
||||||
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_examples.map(
|
eval_dataset = eval_examples.map(
|
||||||
prepare_validation_features,
|
prepare_validation_features,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -500,6 +502,7 @@ def main():
|
|||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
predict_examples = predict_examples.select(range(data_args.max_predict_samples))
|
predict_examples = predict_examples.select(range(data_args.max_predict_samples))
|
||||||
# Predict Feature Creation
|
# Predict Feature Creation
|
||||||
|
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||||
predict_dataset = predict_examples.map(
|
predict_dataset = predict_examples.map(
|
||||||
prepare_validation_features,
|
prepare_validation_features,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -429,6 +429,7 @@ def main():
|
|||||||
# Select samples from Dataset, This will help to decrease processing time
|
# Select samples from Dataset, This will help to decrease processing time
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
# Create Training Features
|
# Create Training Features
|
||||||
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
prepare_train_features,
|
prepare_train_features,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -515,6 +516,7 @@ def main():
|
|||||||
# Selecting Eval Samples from Dataset
|
# Selecting Eval Samples from Dataset
|
||||||
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
||||||
# Create Features from Eval Dataset
|
# Create Features from Eval Dataset
|
||||||
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_examples.map(
|
eval_dataset = eval_examples.map(
|
||||||
prepare_validation_features,
|
prepare_validation_features,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -535,6 +537,7 @@ def main():
|
|||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
predict_examples = predict_examples.select(range(data_args.max_predict_samples))
|
predict_examples = predict_examples.select(range(data_args.max_predict_samples))
|
||||||
# Test Feature Creation
|
# Test Feature Creation
|
||||||
|
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||||
predict_dataset = predict_examples.map(
|
predict_dataset = predict_examples.map(
|
||||||
prepare_validation_features,
|
prepare_validation_features,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -435,6 +435,7 @@ def main():
|
|||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -451,6 +452,7 @@ def main():
|
|||||||
eval_dataset = raw_datasets["validation"]
|
eval_dataset = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||||
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_dataset.map(
|
eval_dataset = eval_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -467,6 +469,7 @@ def main():
|
|||||||
predict_dataset = raw_datasets["test"]
|
predict_dataset = raw_datasets["test"]
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
||||||
|
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||||
predict_dataset = predict_dataset.map(
|
predict_dataset = predict_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -400,6 +400,7 @@ def main():
|
|||||||
result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
|
result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
with training_args.main_process_first(desc="dataset map pre-processing"):
|
||||||
raw_datasets = raw_datasets.map(
|
raw_datasets = raw_datasets.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -526,7 +527,7 @@ def main():
|
|||||||
|
|
||||||
for predict_dataset, task in zip(predict_datasets, tasks):
|
for predict_dataset, task in zip(predict_datasets, tasks):
|
||||||
# Removing the `label` columns because it contains -1 and Trainer won't like that.
|
# Removing the `label` columns because it contains -1 and Trainer won't like that.
|
||||||
predict_dataset.remove_columns_("label")
|
predict_dataset = predict_dataset.remove_columns("label")
|
||||||
predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
|
predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
|
||||||
predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
|
predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
|
||||||
|
|
||||||
|
|||||||
@@ -280,6 +280,7 @@ def main():
|
|||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -293,6 +294,7 @@ def main():
|
|||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||||
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_dataset.map(
|
eval_dataset = eval_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -303,6 +305,7 @@ def main():
|
|||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
||||||
|
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||||
predict_dataset = predict_dataset.map(
|
predict_dataset = predict_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -390,6 +390,7 @@ def main():
|
|||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
tokenize_and_align_labels,
|
tokenize_and_align_labels,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -404,6 +405,7 @@ def main():
|
|||||||
eval_dataset = raw_datasets["validation"]
|
eval_dataset = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||||
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_dataset.map(
|
eval_dataset = eval_dataset.map(
|
||||||
tokenize_and_align_labels,
|
tokenize_and_align_labels,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -418,6 +420,7 @@ def main():
|
|||||||
predict_dataset = raw_datasets["test"]
|
predict_dataset = raw_datasets["test"]
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
||||||
|
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||||
predict_dataset = predict_dataset.map(
|
predict_dataset = predict_dataset.map(
|
||||||
tokenize_and_align_labels,
|
tokenize_and_align_labels,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -370,6 +370,7 @@ def main():
|
|||||||
# Select Sample from Dataset
|
# Select Sample from Dataset
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
# tokenize train dataset in batch
|
# tokenize train dataset in batch
|
||||||
|
with training_args.main_process_first(desc="train dataset map tokenization"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
tokenize_function,
|
tokenize_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -386,6 +387,7 @@ def main():
|
|||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||||
# tokenize validation dataset
|
# tokenize validation dataset
|
||||||
|
with training_args.main_process_first(desc="validation dataset map tokenization"):
|
||||||
eval_dataset = eval_dataset.map(
|
eval_dataset = eval_dataset.map(
|
||||||
tokenize_function,
|
tokenize_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -402,6 +404,7 @@ def main():
|
|||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
||||||
# tokenize predict dataset
|
# tokenize predict dataset
|
||||||
|
with training_args.main_process_first(desc="prediction dataset map tokenization"):
|
||||||
predict_dataset = predict_dataset.map(
|
predict_dataset = predict_dataset.map(
|
||||||
tokenize_function,
|
tokenize_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -503,7 +503,7 @@ def main():
|
|||||||
|
|
||||||
for test_dataset, task in zip(test_datasets, tasks):
|
for test_dataset, task in zip(test_datasets, tasks):
|
||||||
# Removing the `label` columns because it contains -1 and Trainer won't like that.
|
# Removing the `label` columns because it contains -1 and Trainer won't like that.
|
||||||
test_dataset.remove_columns_("label")
|
test_dataset = test_dataset.remove_columns("label")
|
||||||
predictions = trainer.predict(test_dataset=test_dataset).predictions
|
predictions = trainer.predict(test_dataset=test_dataset).predictions
|
||||||
predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
|
predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user