MLM training fails with no validation file(same as #12406 for pytorch now) (#12517)

* Validation split percentage to be used for custom data files also

Issue same as https://github.com/huggingface/transformers/issues/12406 fixed for pytorch branch run_mlm.py

* Validation split added in the right place

* Update run_clm.py

* validation split added for custom files

* Validation split added for custom files

* Update run_plm.py

* fixed validation split for custom files as input for pytorch examples in lm

* Update run_clm_no_trainer.py

* args modified
This commit is contained in:
Souvic Chakraborty
2021-07-07 18:35:44 +05:30
committed by GitHub
parent 3488ef5a92
commit 1d6623c6a2
5 changed files with 72 additions and 0 deletions

View File

@@ -278,6 +278,21 @@ def main():
if extension == "txt":
extension = "text"
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
# If no validation data is there, validation_split_percentage will be used to divide the dataset.
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
extension,
data_files=data_files,
split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=model_args.cache_dir,
)
raw_datasets["train"] = load_dataset(
extension,
data_files=data_files,
split=f"train[{data_args.validation_split_percentage}%:]",
cache_dir=model_args.cache_dir,
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

View File

@@ -253,6 +253,19 @@ def main():
if extension == "txt":
extension = "text"
raw_datasets = load_dataset(extension, data_files=data_files)
# If no validation data is there, validation_split_percentage will be used to divide the dataset.
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
extension,
data_files=data_files,
split=f"train[:{args.validation_split_percentage}%]",
)
raw_datasets["train"] = load_dataset(
extension,
data_files=data_files,
split=f"train[{args.validation_split_percentage}%:]",
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

View File

@@ -285,6 +285,22 @@ def main():
if extension == "txt":
extension = "text"
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
# If no validation data is there, validation_split_percentage will be used to divide the dataset.
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
extension,
data_files=data_files,
split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=model_args.cache_dir,
)
raw_datasets["train"] = load_dataset(
extension,
data_files=data_files,
split=f"train[{data_args.validation_split_percentage}%:]",
cache_dir=model_args.cache_dir,
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

View File

@@ -265,6 +265,19 @@ def main():
if extension == "txt":
extension = "text"
raw_datasets = load_dataset(extension, data_files=data_files)
# If no validation data is there, validation_split_percentage will be used to divide the dataset.
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
extension,
data_files=data_files,
split=f"train[:{args.validation_split_percentage}%]",
)
raw_datasets["train"] = load_dataset(
extension,
data_files=data_files,
split=f"train[{args.validation_split_percentage}%:]",
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

View File

@@ -280,6 +280,21 @@ def main():
if extension == "txt":
extension = "text"
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
# If no validation data is there, validation_split_percentage will be used to divide the dataset.
if "validation" not in raw_datasets.keys():
raw_datasets["validation"] = load_dataset(
extension,
data_files=data_files,
split=f"train[:{data_args.validation_split_percentage}%]",
cache_dir=model_args.cache_dir,
)
raw_datasets["train"] = load_dataset(
extension,
data_files=data_files,
split=f"train[{data_args.validation_split_percentage}%:]",
cache_dir=model_args.cache_dir,
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.