[Examples] Added context manager to datasets map (#12367)
* added context manager to datasets map * fixed style and spaces * fixed warning of deprecation * changed desc
This commit is contained in:
@@ -356,14 +356,15 @@ def main():
|
|||||||
)
|
)
|
||||||
return output
|
return output
|
||||||
|
|
||||||
tokenized_datasets = raw_datasets.map(
|
with training_args.main_process_first(desc="dataset map tokenization"):
|
||||||
tokenize_function,
|
tokenized_datasets = raw_datasets.map(
|
||||||
batched=True,
|
tokenize_function,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on dataset",
|
||||||
|
)
|
||||||
|
|
||||||
if data_args.block_size is None:
|
if data_args.block_size is None:
|
||||||
block_size = tokenizer.model_max_length
|
block_size = tokenizer.model_max_length
|
||||||
@@ -404,13 +405,14 @@ def main():
|
|||||||
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
||||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
||||||
|
|
||||||
lm_datasets = tokenized_datasets.map(
|
with training_args.main_process_first(desc="grouping texts together"):
|
||||||
group_texts,
|
lm_datasets = tokenized_datasets.map(
|
||||||
batched=True,
|
group_texts,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
desc=f"Grouping texts in chunks of {block_size}",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc=f"Grouping texts in chunks of {block_size}",
|
||||||
|
)
|
||||||
|
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
if "train" not in tokenized_datasets:
|
if "train" not in tokenized_datasets:
|
||||||
|
|||||||
@@ -383,14 +383,15 @@ def main():
|
|||||||
return_special_tokens_mask=True,
|
return_special_tokens_mask=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
tokenized_datasets = raw_datasets.map(
|
with training_args.main_process_first(desc="dataset map tokenization"):
|
||||||
tokenize_function,
|
tokenized_datasets = raw_datasets.map(
|
||||||
batched=True,
|
tokenize_function,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=[text_column_name],
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=[text_column_name],
|
||||||
desc="Running tokenizer on dataset line_by_line",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on dataset line_by_line",
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
||||||
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
|
# We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
|
||||||
@@ -398,14 +399,15 @@ def main():
|
|||||||
def tokenize_function(examples):
|
def tokenize_function(examples):
|
||||||
return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
|
return tokenizer(examples[text_column_name], return_special_tokens_mask=True)
|
||||||
|
|
||||||
tokenized_datasets = raw_datasets.map(
|
with training_args.main_process_first(desc="dataset map tokenization"):
|
||||||
tokenize_function,
|
tokenized_datasets = raw_datasets.map(
|
||||||
batched=True,
|
tokenize_function,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on every text in dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on every text in dataset",
|
||||||
|
)
|
||||||
|
|
||||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
||||||
# max_seq_length.
|
# max_seq_length.
|
||||||
@@ -430,13 +432,14 @@ def main():
|
|||||||
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
||||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
||||||
|
|
||||||
tokenized_datasets = tokenized_datasets.map(
|
with training_args.main_process_first(desc="grouping texts together"):
|
||||||
group_texts,
|
tokenized_datasets = tokenized_datasets.map(
|
||||||
batched=True,
|
group_texts,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
desc=f"Grouping texts in chunks of {max_seq_length}",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc=f"Grouping texts in chunks of {max_seq_length}",
|
||||||
|
)
|
||||||
|
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
if "train" not in tokenized_datasets:
|
if "train" not in tokenized_datasets:
|
||||||
|
|||||||
@@ -359,27 +359,29 @@ def main():
|
|||||||
examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
|
examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
|
||||||
return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)
|
return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)
|
||||||
|
|
||||||
tokenized_datasets = raw_datasets.map(
|
with training_args.main_process_first(desc="dataset map tokenization"):
|
||||||
tokenize_function,
|
tokenized_datasets = raw_datasets.map(
|
||||||
batched=True,
|
tokenize_function,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=[text_column_name],
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=[text_column_name],
|
||||||
desc="Running tokenizer on dataset line_by_line",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on dataset line_by_line",
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
|
||||||
def tokenize_function(examples):
|
def tokenize_function(examples):
|
||||||
return tokenizer(examples[text_column_name])
|
return tokenizer(examples[text_column_name])
|
||||||
|
|
||||||
tokenized_datasets = raw_datasets.map(
|
with training_args.main_process_first(desc="dataset map tokenization"):
|
||||||
tokenize_function,
|
tokenized_datasets = raw_datasets.map(
|
||||||
batched=True,
|
tokenize_function,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on every text in dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on every text in dataset",
|
||||||
|
)
|
||||||
|
|
||||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
# Main data processing function that will concatenate all texts from our dataset and generate chunks of
|
||||||
# max_seq_length.
|
# max_seq_length.
|
||||||
@@ -404,13 +406,14 @@ def main():
|
|||||||
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
# To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
|
||||||
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
# https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
|
||||||
|
|
||||||
tokenized_datasets = tokenized_datasets.map(
|
with training_args.main_process_first(desc="grouping texts together"):
|
||||||
group_texts,
|
tokenized_datasets = tokenized_datasets.map(
|
||||||
batched=True,
|
group_texts,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
desc=f"Grouping texts in chunks of {max_seq_length}",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc=f"Grouping texts in chunks of {max_seq_length}",
|
||||||
|
)
|
||||||
|
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
if "train" not in tokenized_datasets:
|
if "train" not in tokenized_datasets:
|
||||||
|
|||||||
@@ -353,12 +353,13 @@ def main():
|
|||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
train_dataset = train_dataset.map(
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
preprocess_function,
|
train_dataset = train_dataset.map(
|
||||||
batched=True,
|
preprocess_function,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
)
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
)
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
if "validation" not in raw_datasets:
|
if "validation" not in raw_datasets:
|
||||||
@@ -366,12 +367,13 @@ def main():
|
|||||||
eval_dataset = raw_datasets["validation"]
|
eval_dataset = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||||
eval_dataset = eval_dataset.map(
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
preprocess_function,
|
eval_dataset = eval_dataset.map(
|
||||||
batched=True,
|
preprocess_function,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
)
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
)
|
||||||
|
|
||||||
# Data collator
|
# Data collator
|
||||||
data_collator = (
|
data_collator = (
|
||||||
|
|||||||
@@ -418,14 +418,15 @@ def main():
|
|||||||
# We will select sample from whole data if agument is specified
|
# We will select sample from whole data if agument is specified
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
# Create train feature from dataset
|
# Create train feature from dataset
|
||||||
train_dataset = train_dataset.map(
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
prepare_train_features,
|
train_dataset = train_dataset.map(
|
||||||
batched=True,
|
prepare_train_features,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on train dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on train dataset",
|
||||||
|
)
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# Number of samples might increase during Feature Creation, We select only specified max samples
|
# Number of samples might increase during Feature Creation, We select only specified max samples
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
@@ -480,14 +481,15 @@ def main():
|
|||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
||||||
# Validation Feature Creation
|
# Validation Feature Creation
|
||||||
eval_dataset = eval_examples.map(
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
prepare_validation_features,
|
eval_dataset = eval_examples.map(
|
||||||
batched=True,
|
prepare_validation_features,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on validation dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on validation dataset",
|
||||||
|
)
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||||
@@ -500,14 +502,15 @@ def main():
|
|||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
predict_examples = predict_examples.select(range(data_args.max_predict_samples))
|
predict_examples = predict_examples.select(range(data_args.max_predict_samples))
|
||||||
# Predict Feature Creation
|
# Predict Feature Creation
|
||||||
predict_dataset = predict_examples.map(
|
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||||
prepare_validation_features,
|
predict_dataset = predict_examples.map(
|
||||||
batched=True,
|
prepare_validation_features,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on prediction dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on prediction dataset",
|
||||||
|
)
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
||||||
|
|||||||
@@ -429,14 +429,15 @@ def main():
|
|||||||
# Select samples from Dataset, This will help to decrease processing time
|
# Select samples from Dataset, This will help to decrease processing time
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
# Create Training Features
|
# Create Training Features
|
||||||
train_dataset = train_dataset.map(
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
prepare_train_features,
|
train_dataset = train_dataset.map(
|
||||||
batched=True,
|
prepare_train_features,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on train dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on train dataset",
|
||||||
|
)
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# Select samples from dataset again since Feature Creation might increase number of features
|
# Select samples from dataset again since Feature Creation might increase number of features
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
@@ -515,14 +516,15 @@ def main():
|
|||||||
# Selecting Eval Samples from Dataset
|
# Selecting Eval Samples from Dataset
|
||||||
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
||||||
# Create Features from Eval Dataset
|
# Create Features from Eval Dataset
|
||||||
eval_dataset = eval_examples.map(
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
prepare_validation_features,
|
eval_dataset = eval_examples.map(
|
||||||
batched=True,
|
prepare_validation_features,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on validation dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on validation dataset",
|
||||||
|
)
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# Selecting Samples from Dataset again since Feature Creation might increase samples size
|
# Selecting Samples from Dataset again since Feature Creation might increase samples size
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||||
@@ -535,14 +537,15 @@ def main():
|
|||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
predict_examples = predict_examples.select(range(data_args.max_predict_samples))
|
predict_examples = predict_examples.select(range(data_args.max_predict_samples))
|
||||||
# Test Feature Creation
|
# Test Feature Creation
|
||||||
predict_dataset = predict_examples.map(
|
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||||
prepare_validation_features,
|
predict_dataset = predict_examples.map(
|
||||||
batched=True,
|
prepare_validation_features,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on prediction dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on prediction dataset",
|
||||||
|
)
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
||||||
|
|||||||
@@ -435,14 +435,15 @@ def main():
|
|||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
train_dataset = train_dataset.map(
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
preprocess_function,
|
train_dataset = train_dataset.map(
|
||||||
batched=True,
|
preprocess_function,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on train dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on train dataset",
|
||||||
|
)
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
max_target_length = data_args.val_max_target_length
|
max_target_length = data_args.val_max_target_length
|
||||||
@@ -451,14 +452,15 @@ def main():
|
|||||||
eval_dataset = raw_datasets["validation"]
|
eval_dataset = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||||
eval_dataset = eval_dataset.map(
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
preprocess_function,
|
eval_dataset = eval_dataset.map(
|
||||||
batched=True,
|
preprocess_function,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on validation dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on validation dataset",
|
||||||
|
)
|
||||||
|
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
max_target_length = data_args.val_max_target_length
|
max_target_length = data_args.val_max_target_length
|
||||||
@@ -467,14 +469,15 @@ def main():
|
|||||||
predict_dataset = raw_datasets["test"]
|
predict_dataset = raw_datasets["test"]
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
||||||
predict_dataset = predict_dataset.map(
|
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||||
preprocess_function,
|
predict_dataset = predict_dataset.map(
|
||||||
batched=True,
|
preprocess_function,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=column_names,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=column_names,
|
||||||
desc="Running tokenizer on prediction dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on prediction dataset",
|
||||||
|
)
|
||||||
|
|
||||||
# Data collator
|
# Data collator
|
||||||
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
|
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
|
||||||
|
|||||||
@@ -400,12 +400,13 @@ def main():
|
|||||||
result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
|
result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
|
||||||
return result
|
return result
|
||||||
|
|
||||||
raw_datasets = raw_datasets.map(
|
with training_args.main_process_first(desc="dataset map pre-processing"):
|
||||||
preprocess_function,
|
raw_datasets = raw_datasets.map(
|
||||||
batched=True,
|
preprocess_function,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
batched=True,
|
||||||
desc="Running tokenizer on dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on dataset",
|
||||||
|
)
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
if "train" not in raw_datasets:
|
if "train" not in raw_datasets:
|
||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
@@ -526,7 +527,7 @@ def main():
|
|||||||
|
|
||||||
for predict_dataset, task in zip(predict_datasets, tasks):
|
for predict_dataset, task in zip(predict_datasets, tasks):
|
||||||
# Removing the `label` columns because it contains -1 and Trainer won't like that.
|
# Removing the `label` columns because it contains -1 and Trainer won't like that.
|
||||||
predict_dataset.remove_columns_("label")
|
predict_dataset = predict_dataset.remove_columns("label")
|
||||||
predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
|
predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
|
||||||
predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
|
predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
|
||||||
|
|
||||||
|
|||||||
@@ -280,12 +280,13 @@ def main():
|
|||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
train_dataset = train_dataset.map(
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
preprocess_function,
|
train_dataset = train_dataset.map(
|
||||||
batched=True,
|
preprocess_function,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
batched=True,
|
||||||
desc="Running tokenizer on train dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on train dataset",
|
||||||
|
)
|
||||||
# Log a few random samples from the training set:
|
# Log a few random samples from the training set:
|
||||||
for index in random.sample(range(len(train_dataset)), 3):
|
for index in random.sample(range(len(train_dataset)), 3):
|
||||||
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
|
||||||
@@ -293,22 +294,24 @@ def main():
|
|||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||||
eval_dataset = eval_dataset.map(
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
preprocess_function,
|
eval_dataset = eval_dataset.map(
|
||||||
batched=True,
|
preprocess_function,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
batched=True,
|
||||||
desc="Running tokenizer on validation dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on validation dataset",
|
||||||
|
)
|
||||||
|
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
||||||
predict_dataset = predict_dataset.map(
|
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||||
preprocess_function,
|
predict_dataset = predict_dataset.map(
|
||||||
batched=True,
|
preprocess_function,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
batched=True,
|
||||||
desc="Running tokenizer on prediction dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on prediction dataset",
|
||||||
|
)
|
||||||
|
|
||||||
# Get the metric function
|
# Get the metric function
|
||||||
metric = load_metric("xnli")
|
metric = load_metric("xnli")
|
||||||
|
|||||||
@@ -390,13 +390,14 @@ def main():
|
|||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
train_dataset = train_dataset.map(
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
tokenize_and_align_labels,
|
train_dataset = train_dataset.map(
|
||||||
batched=True,
|
tokenize_and_align_labels,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
desc="Running tokenizer on train dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on train dataset",
|
||||||
|
)
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
if "validation" not in raw_datasets:
|
if "validation" not in raw_datasets:
|
||||||
@@ -404,13 +405,14 @@ def main():
|
|||||||
eval_dataset = raw_datasets["validation"]
|
eval_dataset = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||||
eval_dataset = eval_dataset.map(
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
tokenize_and_align_labels,
|
eval_dataset = eval_dataset.map(
|
||||||
batched=True,
|
tokenize_and_align_labels,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
desc="Running tokenizer on validation dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on validation dataset",
|
||||||
|
)
|
||||||
|
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
if "test" not in raw_datasets:
|
if "test" not in raw_datasets:
|
||||||
@@ -418,13 +420,14 @@ def main():
|
|||||||
predict_dataset = raw_datasets["test"]
|
predict_dataset = raw_datasets["test"]
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
||||||
predict_dataset = predict_dataset.map(
|
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||||
tokenize_and_align_labels,
|
predict_dataset = predict_dataset.map(
|
||||||
batched=True,
|
tokenize_and_align_labels,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
desc="Running tokenizer on prediction dataset",
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
)
|
desc="Running tokenizer on prediction dataset",
|
||||||
|
)
|
||||||
|
|
||||||
# Data collator
|
# Data collator
|
||||||
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)
|
data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)
|
||||||
|
|||||||
@@ -370,13 +370,14 @@ def main():
|
|||||||
# Select Sample from Dataset
|
# Select Sample from Dataset
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||||
# tokenize train dataset in batch
|
# tokenize train dataset in batch
|
||||||
train_dataset = train_dataset.map(
|
with training_args.main_process_first(desc="train dataset map tokenization"):
|
||||||
tokenize_function,
|
train_dataset = train_dataset.map(
|
||||||
batched=True,
|
tokenize_function,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=[text_column_name],
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=[text_column_name],
|
||||||
)
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
)
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
if "validation" not in raw_datasets:
|
if "validation" not in raw_datasets:
|
||||||
@@ -386,13 +387,14 @@ def main():
|
|||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||||
# tokenize validation dataset
|
# tokenize validation dataset
|
||||||
eval_dataset = eval_dataset.map(
|
with training_args.main_process_first(desc="validation dataset map tokenization"):
|
||||||
tokenize_function,
|
eval_dataset = eval_dataset.map(
|
||||||
batched=True,
|
tokenize_function,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=[text_column_name],
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=[text_column_name],
|
||||||
)
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
)
|
||||||
|
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
if "test" not in raw_datasets:
|
if "test" not in raw_datasets:
|
||||||
@@ -402,13 +404,14 @@ def main():
|
|||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
||||||
# tokenize predict dataset
|
# tokenize predict dataset
|
||||||
predict_dataset = predict_dataset.map(
|
with training_args.main_process_first(desc="prediction dataset map tokenization"):
|
||||||
tokenize_function,
|
predict_dataset = predict_dataset.map(
|
||||||
batched=True,
|
tokenize_function,
|
||||||
num_proc=data_args.preprocessing_num_workers,
|
batched=True,
|
||||||
remove_columns=[text_column_name],
|
num_proc=data_args.preprocessing_num_workers,
|
||||||
load_from_cache_file=not data_args.overwrite_cache,
|
remove_columns=[text_column_name],
|
||||||
)
|
load_from_cache_file=not data_args.overwrite_cache,
|
||||||
|
)
|
||||||
|
|
||||||
# Data collator
|
# Data collator
|
||||||
data_collator=default_data_collator if not training_args.fp16 else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
|
data_collator=default_data_collator if not training_args.fp16 else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
|
||||||
|
|||||||
@@ -503,7 +503,7 @@ def main():
|
|||||||
|
|
||||||
for test_dataset, task in zip(test_datasets, tasks):
|
for test_dataset, task in zip(test_datasets, tasks):
|
||||||
# Removing the `label` columns because it contains -1 and Trainer won't like that.
|
# Removing the `label` columns because it contains -1 and Trainer won't like that.
|
||||||
test_dataset.remove_columns_("label")
|
test_dataset = test_dataset.remove_columns("label")
|
||||||
predictions = trainer.predict(test_dataset=test_dataset).predictions
|
predictions = trainer.predict(test_dataset=test_dataset).predictions
|
||||||
predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
|
predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user