[Examples] Add automatic dataset splitting in language-modeling examples (#9133)

* Replace jnp.split, remove textual model inputs, and ensure warmup_steps > 0

* Add automatic dataset splitting in language-modeling examples
This commit is contained in:
Teven
2020-12-15 22:02:43 +01:00
committed by GitHub
parent e771749777
commit 2a7e8e1608
5 changed files with 113 additions and 16 deletions

View File

@@ -93,6 +93,12 @@ class DataTrainingArguments:
overwrite_cache: bool = field(
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
validation_split_percentage: Optional[int] = field(
default=5,
metadata={
"help": "The percentage of the train set used as validation set in case there's no validation split"
},
)
max_seq_length: int = field(
default=512,
metadata={
@@ -196,6 +202,17 @@ def main():
if data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub.
datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name)
if "validation" not in datasets.keys():
datasets["validation"] = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
split=f"train[:{data_args.validation_split_percentage}%]",
)
datasets["train"] = load_dataset(
data_args.dataset_name,
data_args.dataset_config_name,
split=f"train[{data_args.validation_split_percentage}%:]",
)
else:
data_files = {}
if data_args.train_file is not None: