[Examples] Added context manager to datasets map (#12367)
* added cotext manager to datasets map * fixed style and spaces * fixed warning of deprecation * changed desc
This commit is contained in:
@@ -390,13 +390,14 @@ def main():
|
||||
train_dataset = raw_datasets["train"]
|
||||
if data_args.max_train_samples is not None:
|
||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||
train_dataset = train_dataset.map(
|
||||
tokenize_and_align_labels,
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc="Running tokenizer on train dataset",
|
||||
)
|
||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||
train_dataset = train_dataset.map(
|
||||
tokenize_and_align_labels,
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc="Running tokenizer on train dataset",
|
||||
)
|
||||
|
||||
if training_args.do_eval:
|
||||
if "validation" not in raw_datasets:
|
||||
@@ -404,13 +405,14 @@ def main():
|
||||
eval_dataset = raw_datasets["validation"]
|
||||
if data_args.max_eval_samples is not None:
|
||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||
eval_dataset = eval_dataset.map(
|
||||
tokenize_and_align_labels,
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc="Running tokenizer on validation dataset",
|
||||
)
|
||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||
eval_dataset = eval_dataset.map(
|
||||
tokenize_and_align_labels,
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc="Running tokenizer on validation dataset",
|
||||
)
|
||||
|
||||
if training_args.do_predict:
|
||||
if "test" not in raw_datasets:
|
||||
@@ -418,13 +420,14 @@ def main():
|
||||
predict_dataset = raw_datasets["test"]
|
||||
if data_args.max_predict_samples is not None:
|
||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
||||
predict_dataset = predict_dataset.map(
|
||||
tokenize_and_align_labels,
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc="Running tokenizer on prediction dataset",
|
||||
)
|
||||
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||
predict_dataset = predict_dataset.map(
|
||||
tokenize_and_align_labels,
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc="Running tokenizer on prediction dataset",
|
||||
)
|
||||
|
||||
# Data collator
|
||||
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)
|
||||
|
||||
Reference in New Issue
Block a user