[examples] max samples can't be bigger than the len of dataset (#16501)
* [examples] max samples can't be bigger than the len of dataset * do tf and flax
This commit is contained in:
@@ -398,14 +398,16 @@ def main():
|
||||
raise ValueError("--do_train requires a train dataset")
|
||||
train_dataset = lm_datasets["train"]
|
||||
if data_args.max_train_samples is not None:
|
||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||
train_dataset = train_dataset.select(range(max_train_samples))
|
||||
|
||||
if training_args.do_eval:
|
||||
if "validation" not in tokenized_datasets:
|
||||
raise ValueError("--do_eval requires a validation dataset")
|
||||
eval_dataset = lm_datasets["validation"]
|
||||
if data_args.max_eval_samples is not None:
|
||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||
|
||||
# Enable tensorboard only on the master node
|
||||
has_tensorboard = is_tensorboard_available()
|
||||
|
||||
@@ -434,7 +434,8 @@ def main():
|
||||
train_dataset = raw_datasets["train"]
|
||||
if data_args.max_train_samples is not None:
|
||||
# We will select sample from whole data if argument is specified
|
||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||
train_dataset = train_dataset.select(range(max_train_samples))
|
||||
# Create train feature from dataset
|
||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||
train_dataset = train_dataset.map(
|
||||
@@ -447,7 +448,8 @@ def main():
|
||||
)
|
||||
if data_args.max_train_samples is not None:
|
||||
# Number of samples might increase during Feature Creation, We select only specified max samples
|
||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||
train_dataset = train_dataset.select(range(max_train_samples))
|
||||
|
||||
# Validation preprocessing
|
||||
def prepare_validation_features(examples):
|
||||
@@ -497,7 +499,8 @@ def main():
|
||||
eval_examples = raw_datasets["validation"]
|
||||
if data_args.max_eval_samples is not None:
|
||||
# We will select sample from whole data
|
||||
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
||||
max_eval_samples = min(len(eval_examples), data_args.max_eval_samples)
|
||||
eval_examples = eval_examples.select(range(max_eval_samples))
|
||||
# Validation Feature Creation
|
||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||
eval_dataset = eval_examples.map(
|
||||
@@ -510,7 +513,8 @@ def main():
|
||||
)
|
||||
if data_args.max_eval_samples is not None:
|
||||
# During Feature creation dataset samples might increase, we will select required samples again
|
||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
|
||||
eval_dataset = eval_dataset.select(range(max_eval_samples))
|
||||
|
||||
if training_args.do_predict:
|
||||
if "test" not in raw_datasets:
|
||||
@@ -531,7 +535,8 @@ def main():
|
||||
)
|
||||
if data_args.max_predict_samples is not None:
|
||||
# During Feature creation dataset samples might increase, we will select required samples again
|
||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
||||
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
|
||||
predict_dataset = predict_dataset.select(range(max_predict_samples))
|
||||
|
||||
# Data collator
|
||||
# We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
|
||||
|
||||
@@ -375,7 +375,8 @@ def main():
|
||||
)
|
||||
|
||||
if data_args.max_train_samples is not None:
|
||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
|
||||
train_dataset = train_dataset.select(range(max_train_samples))
|
||||
|
||||
if data_args.max_val_samples is not None:
|
||||
eval_dataset = eval_dataset.select(range(data_args.max_val_samples))
|
||||
|
||||
Reference in New Issue
Block a user