From 1d6623c6a25f9c1be3af36ffdcc3b0e0d3848999 Mon Sep 17 00:00:00 2001 From: Souvic Chakraborty Date: Wed, 7 Jul 2021 18:35:44 +0530 Subject: [PATCH] MLM training fails with no validation file(same as #12406 for pytorch now) (#12517) * Validation split percentage to be used for custom data files also Issue same as https://github.com/huggingface/transformers/issues/12406 fixed for pytorch branch run_mlm.py * Validation split added in the right place * Update run_clm.py * validation split added for custom files * Validation split added for custom files * Update run_plm.py * fixed validation split for custom files as input for pytorch examples in lm * Update run_clm_no_trainer.py * args modified --- examples/pytorch/language-modeling/run_clm.py | 15 +++++++++++++++ .../language-modeling/run_clm_no_trainer.py | 13 +++++++++++++ examples/pytorch/language-modeling/run_mlm.py | 16 ++++++++++++++++ .../language-modeling/run_mlm_no_trainer.py | 13 +++++++++++++ examples/pytorch/language-modeling/run_plm.py | 15 +++++++++++++++ 5 files changed, 72 insertions(+) diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index cd76849f75..13b5e660f4 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -278,6 +278,21 @@ def main(): if extension == "txt": extension = "text" raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + ) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 2d51b8a655..c22021aaa1 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -253,6 +253,19 @@ def main(): if extension == "txt": extension = "text" raw_datasets = load_dataset(extension, data_files=data_files) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + ) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index f016e3df01..b2acdb999e 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -285,6 +285,22 @@ def main(): if extension == "txt": extension = "text" raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + ) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index e5203f3f9a..4deb746025 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -265,6 +265,19 @@ def main(): if extension == "txt": extension = "text" raw_datasets = load_dataset(extension, data_files=data_files) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{args.validation_split_percentage}%]", + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{args.validation_split_percentage}%:]", + ) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html. diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 125dc88443..fbc98fb01f 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -280,6 +280,21 @@ def main(): if extension == "txt": extension = "text" raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir) + # If no validation data is there, validation_split_percentage will be used to divide the dataset. + if "validation" not in raw_datasets.keys(): + raw_datasets["validation"] = load_dataset( + extension, + data_files=data_files, + split=f"train[:{data_args.validation_split_percentage}%]", + cache_dir=model_args.cache_dir, + ) + raw_datasets["train"] = load_dataset( + extension, + data_files=data_files, + split=f"train[{data_args.validation_split_percentage}%:]", + cache_dir=model_args.cache_dir, + ) + # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at # https://huggingface.co/docs/datasets/loading_datasets.html.