[examples] max samples can't be bigger than the len of dataset (#16501)
* [examples] max samples can't be bigger than the len of dataset * do tf and flax
This commit is contained in:
@@ -613,7 +613,8 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = dataset["train"]
|
train_dataset = dataset["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
# remove problematic examples
|
# remove problematic examples
|
||||||
# (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
|
# (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
|
||||||
# instead here.)
|
# instead here.)
|
||||||
@@ -646,7 +647,8 @@ def main():
|
|||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = dataset["validation"]
|
eval_dataset = dataset["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
# remove problematic examples
|
# remove problematic examples
|
||||||
# (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
|
# (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
|
||||||
# instead here.)
|
# instead here.)
|
||||||
@@ -675,7 +677,8 @@ def main():
|
|||||||
raise ValueError("--do_predict requires a test dataset")
|
raise ValueError("--do_predict requires a test dataset")
|
||||||
predict_dataset = dataset["test"]
|
predict_dataset = dataset["test"]
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
|
||||||
|
predict_dataset = predict_dataset.select(range(max_predict_samples))
|
||||||
# remove problematic examples
|
# remove problematic examples
|
||||||
# (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
|
# (if feature extraction is performed at the beginning, the filtering is done during preprocessing below
|
||||||
# instead here.)
|
# instead here.)
|
||||||
|
|||||||
@@ -527,14 +527,16 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = lm_datasets["train"]
|
train_dataset = lm_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
if "validation" not in tokenized_datasets:
|
if "validation" not in tokenized_datasets:
|
||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = lm_datasets["validation"]
|
eval_dataset = lm_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
# Enable tensorboard only on the master node
|
# Enable tensorboard only on the master node
|
||||||
has_tensorboard = is_tensorboard_available()
|
has_tensorboard = is_tensorboard_available()
|
||||||
|
|||||||
@@ -602,7 +602,8 @@ def main():
|
|||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# We will select sample from whole data if argument is specified
|
# We will select sample from whole data if argument is specified
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
# Create train feature from dataset
|
# Create train feature from dataset
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
prepare_train_features,
|
prepare_train_features,
|
||||||
@@ -613,7 +614,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# Number of samples might increase during Feature Creation, We select only specified max samples
|
# Number of samples might increase during Feature Creation, We select only specified max samples
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
processed_raw_datasets["train"] = train_dataset
|
processed_raw_datasets["train"] = train_dataset
|
||||||
|
|
||||||
# Validation preprocessing
|
# Validation preprocessing
|
||||||
@@ -669,7 +671,8 @@ def main():
|
|||||||
eval_examples = raw_datasets["validation"]
|
eval_examples = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
|
||||||
|
eval_examples = eval_examples.select(range(max_eval_samples))
|
||||||
# Validation Feature Creation
|
# Validation Feature Creation
|
||||||
eval_dataset = eval_examples.map(
|
eval_dataset = eval_examples.map(
|
||||||
prepare_validation_features,
|
prepare_validation_features,
|
||||||
@@ -680,7 +683,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
processed_raw_datasets["validation"] = eval_dataset
|
processed_raw_datasets["validation"] = eval_dataset
|
||||||
|
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
@@ -700,7 +704,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
|
||||||
|
predict_dataset = predict_dataset.select(range(max_predict_samples))
|
||||||
processed_raw_datasets["test"] = predict_dataset
|
processed_raw_datasets["test"] = predict_dataset
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
|||||||
@@ -547,7 +547,8 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = dataset["train"]
|
train_dataset = dataset["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -563,7 +564,8 @@ def main():
|
|||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = dataset["validation"]
|
eval_dataset = dataset["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
eval_dataset = eval_dataset.map(
|
eval_dataset = eval_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
@@ -579,7 +581,8 @@ def main():
|
|||||||
raise ValueError("--do_predict requires a test dataset")
|
raise ValueError("--do_predict requires a test dataset")
|
||||||
predict_dataset = dataset["test"]
|
predict_dataset = dataset["test"]
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
|
||||||
|
predict_dataset = predict_dataset.select(range(max_predict_samples))
|
||||||
predict_dataset = predict_dataset.map(
|
predict_dataset = predict_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
batched=True,
|
batched=True,
|
||||||
|
|||||||
@@ -404,7 +404,8 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = dataset["train"]
|
train_dataset = dataset["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
|
|
||||||
train_dataset = train_dataset.filter(
|
train_dataset = train_dataset.filter(
|
||||||
filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
|
filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
|
||||||
@@ -426,7 +427,8 @@ def main():
|
|||||||
raise ValueError("--do_eval requires a train validation")
|
raise ValueError("--do_eval requires a train validation")
|
||||||
eval_dataset = dataset["validation"]
|
eval_dataset = dataset["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
eval_dataset = eval_dataset.filter(
|
eval_dataset = eval_dataset.filter(
|
||||||
filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
|
filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
|
||||||
@@ -448,7 +450,8 @@ def main():
|
|||||||
raise ValueError("--do_predict requires a test dataset")
|
raise ValueError("--do_predict requires a test dataset")
|
||||||
test_dataset = dataset["test"]
|
test_dataset = dataset["test"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
test_dataset = test_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(test_dataset), data_args.max_eval_samples)
|
||||||
|
test_dataset = test_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
test_dataset = test_dataset.filter(
|
test_dataset = test_dataset.filter(
|
||||||
filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
|
filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers
|
||||||
|
|||||||
@@ -445,14 +445,16 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = lm_datasets["train"]
|
train_dataset = lm_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
if "validation" not in tokenized_datasets:
|
if "validation" not in tokenized_datasets:
|
||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = lm_datasets["validation"]
|
eval_dataset = lm_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
def preprocess_logits_for_metrics(logits, labels):
|
def preprocess_logits_for_metrics(logits, labels):
|
||||||
if isinstance(logits, tuple):
|
if isinstance(logits, tuple):
|
||||||
|
|||||||
@@ -468,14 +468,16 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = tokenized_datasets["train"]
|
train_dataset = tokenized_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
if "validation" not in tokenized_datasets:
|
if "validation" not in tokenized_datasets:
|
||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = tokenized_datasets["validation"]
|
eval_dataset = tokenized_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
def preprocess_logits_for_metrics(logits, labels):
|
def preprocess_logits_for_metrics(logits, labels):
|
||||||
if isinstance(logits, tuple):
|
if isinstance(logits, tuple):
|
||||||
|
|||||||
@@ -438,14 +438,16 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = tokenized_datasets["train"]
|
train_dataset = tokenized_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
if "validation" not in tokenized_datasets:
|
if "validation" not in tokenized_datasets:
|
||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = tokenized_datasets["validation"]
|
eval_dataset = tokenized_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
# Data collator
|
# Data collator
|
||||||
data_collator = DataCollatorForPermutationLanguageModeling(
|
data_collator = DataCollatorForPermutationLanguageModeling(
|
||||||
|
|||||||
@@ -352,7 +352,8 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
@@ -366,7 +367,8 @@ def main():
|
|||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = raw_datasets["validation"]
|
eval_dataset = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_dataset.map(
|
eval_dataset = eval_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
|
|||||||
@@ -421,7 +421,8 @@ def main():
|
|||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# We will select sample from whole data if argument is specified
|
# We will select sample from whole data if argument is specified
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
# Create train feature from dataset
|
# Create train feature from dataset
|
||||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
@@ -434,7 +435,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# Number of samples might increase during Feature Creation, We select only specified max samples
|
# Number of samples might increase during Feature Creation, We select only specified max samples
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
|
|
||||||
# Validation preprocessing
|
# Validation preprocessing
|
||||||
def prepare_validation_features(examples):
|
def prepare_validation_features(examples):
|
||||||
@@ -489,7 +491,8 @@ def main():
|
|||||||
eval_examples = raw_datasets["validation"]
|
eval_examples = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
|
||||||
|
eval_examples = eval_examples.select(range(max_eval_samples))
|
||||||
# Validation Feature Creation
|
# Validation Feature Creation
|
||||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_examples.map(
|
eval_dataset = eval_examples.map(
|
||||||
@@ -502,7 +505,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
if "test" not in raw_datasets:
|
if "test" not in raw_datasets:
|
||||||
@@ -523,7 +527,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
|
||||||
|
predict_dataset = predict_dataset.select(range(max_predict_samples))
|
||||||
|
|
||||||
# Data collator
|
# Data collator
|
||||||
# We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
|
# We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
|
||||||
|
|||||||
@@ -432,7 +432,8 @@ def main():
|
|||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# Select samples from Dataset, This will help to decrease processing time
|
# Select samples from Dataset, This will help to decrease processing time
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
# Create Training Features
|
# Create Training Features
|
||||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
@@ -445,7 +446,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# Select samples from dataset again since Feature Creation might increase number of features
|
# Select samples from dataset again since Feature Creation might increase number of features
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
|
|
||||||
# Validation preprocessing
|
# Validation preprocessing
|
||||||
def prepare_validation_features(examples):
|
def prepare_validation_features(examples):
|
||||||
@@ -519,7 +521,8 @@ def main():
|
|||||||
eval_examples = raw_datasets["validation"]
|
eval_examples = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# Selecting Eval Samples from Dataset
|
# Selecting Eval Samples from Dataset
|
||||||
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
|
||||||
|
eval_examples = eval_examples.select(range(max_eval_samples))
|
||||||
# Create Features from Eval Dataset
|
# Create Features from Eval Dataset
|
||||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_examples.map(
|
eval_dataset = eval_examples.map(
|
||||||
@@ -532,7 +535,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# Selecting Samples from Dataset again since Feature Creation might increase samples size
|
# Selecting Samples from Dataset again since Feature Creation might increase samples size
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
if "test" not in raw_datasets:
|
if "test" not in raw_datasets:
|
||||||
@@ -553,7 +557,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
|
||||||
|
predict_dataset = predict_dataset.select(range(max_predict_samples))
|
||||||
|
|
||||||
# Data collator
|
# Data collator
|
||||||
# We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
|
# We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
|
||||||
|
|||||||
@@ -489,7 +489,8 @@ def main():
|
|||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# We will select sample from whole data if argument is specified
|
# We will select sample from whole data if argument is specified
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
# Create train feature from dataset
|
# Create train feature from dataset
|
||||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
@@ -502,7 +503,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# Number of samples might increase during Feature Creation, We select only specified max samples
|
# Number of samples might increase during Feature Creation, We select only specified max samples
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
if "validation" not in raw_datasets:
|
if "validation" not in raw_datasets:
|
||||||
@@ -510,7 +512,8 @@ def main():
|
|||||||
eval_examples = raw_datasets["validation"]
|
eval_examples = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
|
||||||
|
eval_examples = eval_examples.select(range(max_eval_samples))
|
||||||
# Validation Feature Creation
|
# Validation Feature Creation
|
||||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_examples.map(
|
eval_dataset = eval_examples.map(
|
||||||
@@ -523,7 +526,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
if "test" not in raw_datasets:
|
if "test" not in raw_datasets:
|
||||||
@@ -544,7 +548,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
|
||||||
|
predict_dataset = predict_dataset.select(range(max_predict_samples))
|
||||||
|
|
||||||
# Data collator
|
# Data collator
|
||||||
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
|
label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
|
||||||
|
|||||||
@@ -504,7 +504,8 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
@@ -521,7 +522,8 @@ def main():
|
|||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = raw_datasets["validation"]
|
eval_dataset = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_dataset.map(
|
eval_dataset = eval_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
@@ -538,7 +540,8 @@ def main():
|
|||||||
raise ValueError("--do_predict requires a test dataset")
|
raise ValueError("--do_predict requires a test dataset")
|
||||||
predict_dataset = raw_datasets["test"]
|
predict_dataset = raw_datasets["test"]
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
|
||||||
|
predict_dataset = predict_dataset.select(range(max_predict_samples))
|
||||||
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||||
predict_dataset = predict_dataset.map(
|
predict_dataset = predict_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
|
|||||||
@@ -415,21 +415,24 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
|
if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
|
||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
|
eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
|
if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
|
||||||
if "test" not in raw_datasets and "test_matched" not in raw_datasets:
|
if "test" not in raw_datasets and "test_matched" not in raw_datasets:
|
||||||
raise ValueError("--do_predict requires a test dataset")
|
raise ValueError("--do_predict requires a test dataset")
|
||||||
predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"]
|
predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"]
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
|
||||||
|
predict_dataset = predict_dataset.select(range(max_predict_samples))
|
||||||
|
|
||||||
# Log a few random samples from the training set:
|
# Log a few random samples from the training set:
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
|
|||||||
@@ -279,7 +279,8 @@ def main():
|
|||||||
|
|
||||||
if training_args.do_train:
|
if training_args.do_train:
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
@@ -293,7 +294,8 @@ def main():
|
|||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_dataset.map(
|
eval_dataset = eval_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
@@ -304,7 +306,8 @@ def main():
|
|||||||
|
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
|
||||||
|
predict_dataset = predict_dataset.select(range(max_predict_samples))
|
||||||
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||||
predict_dataset = predict_dataset.map(
|
predict_dataset = predict_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
|
|||||||
@@ -431,7 +431,8 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
tokenize_and_align_labels,
|
tokenize_and_align_labels,
|
||||||
@@ -446,7 +447,8 @@ def main():
|
|||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = raw_datasets["validation"]
|
eval_dataset = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_dataset.map(
|
eval_dataset = eval_dataset.map(
|
||||||
tokenize_and_align_labels,
|
tokenize_and_align_labels,
|
||||||
@@ -461,7 +463,8 @@ def main():
|
|||||||
raise ValueError("--do_predict requires a test dataset")
|
raise ValueError("--do_predict requires a test dataset")
|
||||||
predict_dataset = raw_datasets["test"]
|
predict_dataset = raw_datasets["test"]
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
|
||||||
|
predict_dataset = predict_dataset.select(range(max_predict_samples))
|
||||||
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||||
predict_dataset = predict_dataset.map(
|
predict_dataset = predict_dataset.map(
|
||||||
tokenize_and_align_labels,
|
tokenize_and_align_labels,
|
||||||
|
|||||||
@@ -433,7 +433,8 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
@@ -450,7 +451,8 @@ def main():
|
|||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = raw_datasets["validation"]
|
eval_dataset = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_dataset.map(
|
eval_dataset = eval_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
@@ -467,7 +469,8 @@ def main():
|
|||||||
raise ValueError("--do_predict requires a test dataset")
|
raise ValueError("--do_predict requires a test dataset")
|
||||||
predict_dataset = raw_datasets["test"]
|
predict_dataset = raw_datasets["test"]
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
|
||||||
|
predict_dataset = predict_dataset.select(range(max_predict_samples))
|
||||||
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||||
predict_dataset = predict_dataset.map(
|
predict_dataset = predict_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
|
|||||||
@@ -398,14 +398,16 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = lm_datasets["train"]
|
train_dataset = lm_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
|
|
||||||
if training_args.do_eval:
|
if training_args.do_eval:
|
||||||
if "validation" not in tokenized_datasets:
|
if "validation" not in tokenized_datasets:
|
||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = lm_datasets["validation"]
|
eval_dataset = lm_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
# Enable tensorboard only on the master node
|
# Enable tensorboard only on the master node
|
||||||
has_tensorboard = is_tensorboard_available()
|
has_tensorboard = is_tensorboard_available()
|
||||||
|
|||||||
@@ -434,7 +434,8 @@ def main():
|
|||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# We will select sample from whole data if agument is specified
|
# We will select sample from whole data if agument is specified
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
# Create train feature from dataset
|
# Create train feature from dataset
|
||||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
@@ -447,7 +448,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# Number of samples might increase during Feature Creation, We select only specified max samples
|
# Number of samples might increase during Feature Creation, We select only specified max samples
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
|
|
||||||
# Validation preprocessing
|
# Validation preprocessing
|
||||||
def prepare_validation_features(examples):
|
def prepare_validation_features(examples):
|
||||||
@@ -497,7 +499,8 @@ def main():
|
|||||||
eval_examples = raw_datasets["validation"]
|
eval_examples = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
|
||||||
|
eval_examples = eval_examples.select(range(max_eval_samples))
|
||||||
# Validation Feature Creation
|
# Validation Feature Creation
|
||||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_examples.map(
|
eval_dataset = eval_examples.map(
|
||||||
@@ -510,7 +513,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
if "test" not in raw_datasets:
|
if "test" not in raw_datasets:
|
||||||
@@ -531,7 +535,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
|
||||||
|
predict_dataset = predict_dataset.select(range(max_predict_samples))
|
||||||
|
|
||||||
# Data collator
|
# Data collator
|
||||||
# We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
|
# We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
|
||||||
|
|||||||
@@ -375,7 +375,8 @@ def main():
|
|||||||
)
|
)
|
||||||
|
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
|
|
||||||
if data_args.max_val_samples is not None:
|
if data_args.max_val_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
|
eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
|
||||||
|
|||||||
@@ -415,9 +415,11 @@ def main():
|
|||||||
train_dataset = train_dataset.select(train_indices)
|
train_dataset = train_dataset.select(train_indices)
|
||||||
|
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
# Log a few random samples from the training set:
|
# Log a few random samples from the training set:
|
||||||
for index in random.sample(range(len(train_dataset)), 3):
|
for index in random.sample(range(len(train_dataset)), 3):
|
||||||
|
|||||||
@@ -456,9 +456,11 @@ def main():
|
|||||||
train_dataset = train_dataset.select(train_indices)
|
train_dataset = train_dataset.select(train_indices)
|
||||||
|
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
|
|
||||||
# Log a few random samples from the training set:
|
# Log a few random samples from the training set:
|
||||||
for index in random.sample(range(len(train_dataset)), 3):
|
for index in random.sample(range(len(train_dataset)), 3):
|
||||||
|
|||||||
@@ -369,7 +369,8 @@ def main():
|
|||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
non_label_columns = [feature for feature in train_dataset.features if feature not in ("label", "labels")]
|
non_label_columns = [feature for feature in train_dataset.features if feature not in ("label", "labels")]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
@@ -385,7 +386,8 @@ def main():
|
|||||||
if not training_args.do_train:
|
if not training_args.do_train:
|
||||||
non_label_columns = [feature for feature in eval_dataset.features if feature not in ("label", "labels")]
|
non_label_columns = [feature for feature in eval_dataset.features if feature not in ("label", "labels")]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_dataset.map(
|
eval_dataset = eval_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
|
|||||||
@@ -438,7 +438,8 @@ def main():
|
|||||||
train_dataset = datasets["train"]
|
train_dataset = datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# We will select sample from whole data if agument is specified
|
# We will select sample from whole data if agument is specified
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
# Create train feature from dataset
|
# Create train feature from dataset
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
prepare_train_features,
|
prepare_train_features,
|
||||||
@@ -449,7 +450,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
# Number of samples might increase during Feature Creation, We select only specified max samples
|
# Number of samples might increase during Feature Creation, We select only specified max samples
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
processed_datasets["train"] = train_dataset
|
processed_datasets["train"] = train_dataset
|
||||||
|
|
||||||
# Validation preprocessing
|
# Validation preprocessing
|
||||||
@@ -505,7 +507,8 @@ def main():
|
|||||||
eval_examples = datasets["validation"]
|
eval_examples = datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# We will select sample from whole data
|
# We will select sample from whole data
|
||||||
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
|
||||||
|
eval_examples = eval_examples.select(range(max_eval_samples))
|
||||||
# Validation Feature Creation
|
# Validation Feature Creation
|
||||||
eval_dataset = eval_examples.map(
|
eval_dataset = eval_examples.map(
|
||||||
prepare_validation_features,
|
prepare_validation_features,
|
||||||
@@ -516,7 +519,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
processed_datasets["validation"] = eval_dataset
|
processed_datasets["validation"] = eval_dataset
|
||||||
|
|
||||||
if training_args.do_predict:
|
if training_args.do_predict:
|
||||||
@@ -536,7 +540,8 @@ def main():
|
|||||||
)
|
)
|
||||||
if data_args.max_predict_samples is not None:
|
if data_args.max_predict_samples is not None:
|
||||||
# During Feature creation dataset samples might increase, we will select required samples again
|
# During Feature creation dataset samples might increase, we will select required samples again
|
||||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
|
||||||
|
predict_dataset = predict_dataset.select(range(max_predict_samples))
|
||||||
processed_datasets["test"] = predict_dataset
|
processed_datasets["test"] = predict_dataset
|
||||||
# endregion
|
# endregion
|
||||||
|
|
||||||
|
|||||||
@@ -490,7 +490,8 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
@@ -509,7 +510,8 @@ def main():
|
|||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = raw_datasets["validation"]
|
eval_dataset = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_dataset.map(
|
eval_dataset = eval_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
|
|||||||
@@ -445,7 +445,8 @@ def main():
|
|||||||
raise ValueError("--do_train requires a train dataset")
|
raise ValueError("--do_train requires a train dataset")
|
||||||
train_dataset = raw_datasets["train"]
|
train_dataset = raw_datasets["train"]
|
||||||
if data_args.max_train_samples is not None:
|
if data_args.max_train_samples is not None:
|
||||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||||
|
train_dataset = train_dataset.select(range(max_train_samples))
|
||||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||||
train_dataset = train_dataset.map(
|
train_dataset = train_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
@@ -464,7 +465,8 @@ def main():
|
|||||||
raise ValueError("--do_eval requires a validation dataset")
|
raise ValueError("--do_eval requires a validation dataset")
|
||||||
eval_dataset = raw_datasets["validation"]
|
eval_dataset = raw_datasets["validation"]
|
||||||
if data_args.max_eval_samples is not None:
|
if data_args.max_eval_samples is not None:
|
||||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||||
|
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||||
eval_dataset = eval_dataset.map(
|
eval_dataset = eval_dataset.map(
|
||||||
preprocess_function,
|
preprocess_function,
|
||||||
|
|||||||
Reference in New Issue
Block a user