Fix input data file extension in examples (#28741)
This commit is contained in:
@@ -558,9 +558,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if data_args.train_file is not None:
|
if data_args.train_file is not None:
|
||||||
data_files["train"] = data_args.train_file
|
data_files["train"] = data_args.train_file
|
||||||
|
extension = data_args.train_file.split(".")[-1]
|
||||||
if data_args.validation_file is not None:
|
if data_args.validation_file is not None:
|
||||||
data_files["validation"] = data_args.validation_file
|
data_files["validation"] = data_args.validation_file
|
||||||
extension = data_args.train_file.split(".")[-1]
|
extension = data_args.validation_file.split(".")[-1]
|
||||||
if extension == "txt":
|
if extension == "txt":
|
||||||
extension = "text"
|
extension = "text"
|
||||||
datasets = load_dataset(
|
datasets = load_dataset(
|
||||||
|
|||||||
@@ -449,9 +449,10 @@ def main():
|
|||||||
dataset_args = {}
|
dataset_args = {}
|
||||||
if data_args.train_file is not None:
|
if data_args.train_file is not None:
|
||||||
data_files["train"] = data_args.train_file
|
data_files["train"] = data_args.train_file
|
||||||
|
extension = data_args.train_file.split(".")[-1]
|
||||||
if data_args.validation_file is not None:
|
if data_args.validation_file is not None:
|
||||||
data_files["validation"] = data_args.validation_file
|
data_files["validation"] = data_args.validation_file
|
||||||
extension = data_args.train_file.split(".")[-1]
|
extension = data_args.validation_file.split(".")[-1]
|
||||||
if extension == "txt":
|
if extension == "txt":
|
||||||
extension = "text"
|
extension = "text"
|
||||||
dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
|
dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
|
||||||
|
|||||||
@@ -485,9 +485,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if data_args.train_file is not None:
|
if data_args.train_file is not None:
|
||||||
data_files["train"] = data_args.train_file
|
data_files["train"] = data_args.train_file
|
||||||
|
extension = data_args.train_file.split(".")[-1]
|
||||||
if data_args.validation_file is not None:
|
if data_args.validation_file is not None:
|
||||||
data_files["validation"] = data_args.validation_file
|
data_files["validation"] = data_args.validation_file
|
||||||
extension = data_args.train_file.split(".")[-1]
|
extension = data_args.validation_file.split(".")[-1]
|
||||||
if extension == "txt":
|
if extension == "txt":
|
||||||
extension = "text"
|
extension = "text"
|
||||||
datasets = load_dataset(
|
datasets = load_dataset(
|
||||||
|
|||||||
@@ -599,9 +599,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if data_args.train_file is not None:
|
if data_args.train_file is not None:
|
||||||
data_files["train"] = data_args.train_file
|
data_files["train"] = data_args.train_file
|
||||||
|
extension = data_args.train_file.split(".")[-1]
|
||||||
if data_args.validation_file is not None:
|
if data_args.validation_file is not None:
|
||||||
data_files["validation"] = data_args.validation_file
|
data_files["validation"] = data_args.validation_file
|
||||||
extension = data_args.train_file.split(".")[-1]
|
extension = data_args.validation_file.split(".")[-1]
|
||||||
if extension == "txt":
|
if extension == "txt":
|
||||||
extension = "text"
|
extension = "text"
|
||||||
datasets = load_dataset(
|
datasets = load_dataset(
|
||||||
|
|||||||
@@ -345,9 +345,10 @@ def main():
|
|||||||
dataset_args = {}
|
dataset_args = {}
|
||||||
if args.train_file is not None:
|
if args.train_file is not None:
|
||||||
data_files["train"] = args.train_file
|
data_files["train"] = args.train_file
|
||||||
|
extension = args.train_file.split(".")[-1]
|
||||||
if args.validation_file is not None:
|
if args.validation_file is not None:
|
||||||
data_files["validation"] = args.validation_file
|
data_files["validation"] = args.validation_file
|
||||||
extension = args.train_file.split(".")[-1]
|
extension = args.validation_file.split(".")[-1]
|
||||||
if extension == "txt":
|
if extension == "txt":
|
||||||
extension = "text"
|
extension = "text"
|
||||||
dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks
|
dataset_args["keep_linebreaks"] = not args.no_keep_linebreaks
|
||||||
|
|||||||
@@ -351,9 +351,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if args.train_file is not None:
|
if args.train_file is not None:
|
||||||
data_files["train"] = args.train_file
|
data_files["train"] = args.train_file
|
||||||
|
extension = args.train_file.split(".")[-1]
|
||||||
if args.validation_file is not None:
|
if args.validation_file is not None:
|
||||||
data_files["validation"] = args.validation_file
|
data_files["validation"] = args.validation_file
|
||||||
extension = args.train_file.split(".")[-1]
|
extension = args.validation_file.split(".")[-1]
|
||||||
if extension == "txt":
|
if extension == "txt":
|
||||||
extension = "text"
|
extension = "text"
|
||||||
raw_datasets = load_dataset(extension, data_files=data_files)
|
raw_datasets = load_dataset(extension, data_files=data_files)
|
||||||
|
|||||||
@@ -328,9 +328,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if data_args.train_file is not None:
|
if data_args.train_file is not None:
|
||||||
data_files["train"] = data_args.train_file
|
data_files["train"] = data_args.train_file
|
||||||
|
extension = data_args.train_file.split(".")[-1]
|
||||||
if data_args.validation_file is not None:
|
if data_args.validation_file is not None:
|
||||||
data_files["validation"] = data_args.validation_file
|
data_files["validation"] = data_args.validation_file
|
||||||
extension = data_args.train_file.split(".")[-1]
|
extension = data_args.validation_file.split(".")[-1]
|
||||||
if extension == "txt":
|
if extension == "txt":
|
||||||
extension = "text"
|
extension = "text"
|
||||||
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
|
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
|
||||||
|
|||||||
@@ -311,9 +311,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if data_args.train_file is not None:
|
if data_args.train_file is not None:
|
||||||
data_files["train"] = data_args.train_file
|
data_files["train"] = data_args.train_file
|
||||||
|
extension = data_args.train_file.split(".")[-1]
|
||||||
if data_args.validation_file is not None:
|
if data_args.validation_file is not None:
|
||||||
data_files["validation"] = data_args.validation_file
|
data_files["validation"] = data_args.validation_file
|
||||||
extension = data_args.train_file.split(".")[-1]
|
extension = data_args.validation_file.split(".")[-1]
|
||||||
raw_datasets = load_dataset(
|
raw_datasets = load_dataset(
|
||||||
extension,
|
extension,
|
||||||
data_files=data_files,
|
data_files=data_files,
|
||||||
|
|||||||
@@ -357,9 +357,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if args.train_file is not None:
|
if args.train_file is not None:
|
||||||
data_files["train"] = args.train_file
|
data_files["train"] = args.train_file
|
||||||
|
extension = args.train_file.split(".")[-1]
|
||||||
if args.validation_file is not None:
|
if args.validation_file is not None:
|
||||||
data_files["validation"] = args.validation_file
|
data_files["validation"] = args.validation_file
|
||||||
extension = args.train_file.split(".")[-1]
|
extension = args.validation_file.split(".")[-1]
|
||||||
raw_datasets = load_dataset(extension, data_files=data_files)
|
raw_datasets = load_dataset(extension, data_files=data_files)
|
||||||
# Trim a number of training examples
|
# Trim a number of training examples
|
||||||
if args.debug:
|
if args.debug:
|
||||||
|
|||||||
@@ -362,11 +362,13 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if args.train_file is not None:
|
if args.train_file is not None:
|
||||||
data_files["train"] = args.train_file
|
data_files["train"] = args.train_file
|
||||||
|
extension = args.train_file.split(".")[-1]
|
||||||
if args.validation_file is not None:
|
if args.validation_file is not None:
|
||||||
data_files["validation"] = args.validation_file
|
data_files["validation"] = args.validation_file
|
||||||
|
extension = args.validation_file.split(".")[-1]
|
||||||
if args.test_file is not None:
|
if args.test_file is not None:
|
||||||
data_files["test"] = args.test_file
|
data_files["test"] = args.test_file
|
||||||
extension = args.train_file.split(".")[-1]
|
extension = args.test_file.split(".")[-1]
|
||||||
raw_datasets = load_dataset(extension, data_files=data_files, field="data")
|
raw_datasets = load_dataset(extension, data_files=data_files, field="data")
|
||||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||||
|
|||||||
@@ -410,11 +410,13 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if args.train_file is not None:
|
if args.train_file is not None:
|
||||||
data_files["train"] = args.train_file
|
data_files["train"] = args.train_file
|
||||||
|
extension = args.train_file.split(".")[-1]
|
||||||
if args.validation_file is not None:
|
if args.validation_file is not None:
|
||||||
data_files["validation"] = args.validation_file
|
data_files["validation"] = args.validation_file
|
||||||
|
extension = args.validation_file.split(".")[-1]
|
||||||
if args.test_file is not None:
|
if args.test_file is not None:
|
||||||
data_files["test"] = args.test_file
|
data_files["test"] = args.test_file
|
||||||
extension = args.train_file.split(".")[-1]
|
extension = args.test_file.split(".")[-1]
|
||||||
raw_datasets = load_dataset(extension, data_files=data_files, field="data")
|
raw_datasets = load_dataset(extension, data_files=data_files, field="data")
|
||||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||||
|
|||||||
@@ -404,9 +404,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if args.train_file is not None:
|
if args.train_file is not None:
|
||||||
data_files["train"] = args.train_file
|
data_files["train"] = args.train_file
|
||||||
|
extension = args.train_file.split(".")[-1]
|
||||||
if args.validation_file is not None:
|
if args.validation_file is not None:
|
||||||
data_files["validation"] = args.validation_file
|
data_files["validation"] = args.validation_file
|
||||||
extension = args.train_file.split(".")[-1]
|
extension = args.validation_file.split(".")[-1]
|
||||||
raw_datasets = load_dataset(extension, data_files=data_files)
|
raw_datasets = load_dataset(extension, data_files=data_files)
|
||||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||||
|
|||||||
@@ -311,11 +311,13 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if data_args.train_file is not None:
|
if data_args.train_file is not None:
|
||||||
data_files["train"] = data_args.train_file
|
data_files["train"] = data_args.train_file
|
||||||
|
extension = data_args.train_file.split(".")[-1]
|
||||||
if data_args.validation_file is not None:
|
if data_args.validation_file is not None:
|
||||||
data_files["validation"] = data_args.validation_file
|
data_files["validation"] = data_args.validation_file
|
||||||
|
extension = data_args.validation_file.split(".")[-1]
|
||||||
if data_args.test_file is not None:
|
if data_args.test_file is not None:
|
||||||
data_files["test"] = data_args.test_file
|
data_files["test"] = data_args.test_file
|
||||||
extension = data_args.train_file.split(".")[-1]
|
extension = data_args.test_file.split(".")[-1]
|
||||||
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
|
raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
|
||||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||||
|
|||||||
@@ -339,9 +339,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if args.train_file is not None:
|
if args.train_file is not None:
|
||||||
data_files["train"] = args.train_file
|
data_files["train"] = args.train_file
|
||||||
|
extension = args.train_file.split(".")[-1]
|
||||||
if args.validation_file is not None:
|
if args.validation_file is not None:
|
||||||
data_files["validation"] = args.validation_file
|
data_files["validation"] = args.validation_file
|
||||||
extension = args.train_file.split(".")[-1]
|
extension = args.validation_file.split(".")[-1]
|
||||||
raw_datasets = load_dataset(extension, data_files=data_files)
|
raw_datasets = load_dataset(extension, data_files=data_files)
|
||||||
# Trim a number of training examples
|
# Trim a number of training examples
|
||||||
if args.debug:
|
if args.debug:
|
||||||
|
|||||||
@@ -384,9 +384,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if args.train_file is not None:
|
if args.train_file is not None:
|
||||||
data_files["train"] = args.train_file
|
data_files["train"] = args.train_file
|
||||||
|
extension = args.train_file.split(".")[-1]
|
||||||
if args.validation_file is not None:
|
if args.validation_file is not None:
|
||||||
data_files["validation"] = args.validation_file
|
data_files["validation"] = args.validation_file
|
||||||
extension = args.train_file.split(".")[-1]
|
extension = args.validation_file.split(".")[-1]
|
||||||
raw_datasets = load_dataset(extension, data_files=data_files)
|
raw_datasets = load_dataset(extension, data_files=data_files)
|
||||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||||
|
|||||||
@@ -297,9 +297,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if data_args.train_file is not None:
|
if data_args.train_file is not None:
|
||||||
data_files["train"] = data_args.train_file
|
data_files["train"] = data_args.train_file
|
||||||
|
extension = data_args.train_file.split(".")[-1]
|
||||||
if data_args.validation_file is not None:
|
if data_args.validation_file is not None:
|
||||||
data_files["validation"] = data_args.validation_file
|
data_files["validation"] = data_args.validation_file
|
||||||
extension = data_args.train_file.split(".")[-1]
|
extension = data_args.validation_file.split(".")[-1]
|
||||||
if extension == "txt":
|
if extension == "txt":
|
||||||
extension = "text"
|
extension = "text"
|
||||||
dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
|
dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
|
||||||
|
|||||||
@@ -285,9 +285,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if args.train_file is not None:
|
if args.train_file is not None:
|
||||||
data_files["train"] = args.train_file
|
data_files["train"] = args.train_file
|
||||||
|
extension = args.train_file.split(".")[-1]
|
||||||
if args.validation_file is not None:
|
if args.validation_file is not None:
|
||||||
data_files["validation"] = args.validation_file
|
data_files["validation"] = args.validation_file
|
||||||
extension = args.train_file.split(".")[-1]
|
extension = args.validation_file.split(".")[-1]
|
||||||
raw_datasets = load_dataset(extension, data_files=data_files)
|
raw_datasets = load_dataset(extension, data_files=data_files)
|
||||||
# Trim a number of training examples
|
# Trim a number of training examples
|
||||||
if args.debug:
|
if args.debug:
|
||||||
|
|||||||
@@ -271,9 +271,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if data_args.train_file is not None:
|
if data_args.train_file is not None:
|
||||||
data_files["train"] = data_args.train_file
|
data_files["train"] = data_args.train_file
|
||||||
|
extension = data_args.train_file.split(".")[-1]
|
||||||
if data_args.validation_file is not None:
|
if data_args.validation_file is not None:
|
||||||
data_files["validation"] = data_args.validation_file
|
data_files["validation"] = data_args.validation_file
|
||||||
extension = data_args.train_file.split(".")[-1]
|
extension = data_args.validation_file.split(".")[-1]
|
||||||
if extension == "txt":
|
if extension == "txt":
|
||||||
extension = "text"
|
extension = "text"
|
||||||
datasets = load_dataset(extension, data_files=data_files)
|
datasets = load_dataset(extension, data_files=data_files)
|
||||||
|
|||||||
@@ -517,9 +517,10 @@ if __name__ == "__main__":
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if data_args.train_file is not None:
|
if data_args.train_file is not None:
|
||||||
data_files["train"] = data_args.train_file
|
data_files["train"] = data_args.train_file
|
||||||
|
extension = data_args.train_file.split(".")[-1]
|
||||||
if data_args.validation_file is not None:
|
if data_args.validation_file is not None:
|
||||||
data_files["validation"] = data_args.validation_file
|
data_files["validation"] = data_args.validation_file
|
||||||
extension = data_args.train_file.split(".")[-1]
|
extension = data_args.validation_file.split(".")[-1]
|
||||||
if extension == "txt":
|
if extension == "txt":
|
||||||
extension = "text"
|
extension = "text"
|
||||||
datasets = load_dataset(extension, data_files=data_files)
|
datasets = load_dataset(extension, data_files=data_files)
|
||||||
|
|||||||
@@ -341,9 +341,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if data_args.train_file is not None:
|
if data_args.train_file is not None:
|
||||||
data_files["train"] = data_args.train_file
|
data_files["train"] = data_args.train_file
|
||||||
|
extension = data_args.train_file.split(".")[-1]
|
||||||
if data_args.validation_file is not None:
|
if data_args.validation_file is not None:
|
||||||
data_files["validation"] = data_args.validation_file
|
data_files["validation"] = data_args.validation_file
|
||||||
extension = data_args.train_file.split(".")[-1]
|
extension = data_args.validation_file.split(".")[-1]
|
||||||
if extension == "txt":
|
if extension == "txt":
|
||||||
extension = "text"
|
extension = "text"
|
||||||
raw_datasets = load_dataset(
|
raw_datasets = load_dataset(
|
||||||
|
|||||||
@@ -320,9 +320,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if data_args.train_file is not None:
|
if data_args.train_file is not None:
|
||||||
data_files["train"] = data_args.train_file
|
data_files["train"] = data_args.train_file
|
||||||
|
extension = data_args.train_file.split(".")[-1]
|
||||||
if data_args.validation_file is not None:
|
if data_args.validation_file is not None:
|
||||||
data_files["validation"] = data_args.validation_file
|
data_files["validation"] = data_args.validation_file
|
||||||
extension = data_args.train_file.split(".")[-1]
|
extension = data_args.validation_file.split(".")[-1]
|
||||||
raw_datasets = load_dataset(
|
raw_datasets = load_dataset(
|
||||||
extension,
|
extension,
|
||||||
data_files=data_files,
|
data_files=data_files,
|
||||||
|
|||||||
@@ -260,9 +260,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if data_args.train_file is not None:
|
if data_args.train_file is not None:
|
||||||
data_files["train"] = data_args.train_file
|
data_files["train"] = data_args.train_file
|
||||||
|
extension = data_args.train_file.split(".")[-1]
|
||||||
if data_args.validation_file is not None:
|
if data_args.validation_file is not None:
|
||||||
data_files["validation"] = data_args.validation_file
|
data_files["validation"] = data_args.validation_file
|
||||||
extension = data_args.train_file.split(".")[-1]
|
extension = data_args.validation_file.split(".")[-1]
|
||||||
raw_datasets = load_dataset(
|
raw_datasets = load_dataset(
|
||||||
extension,
|
extension,
|
||||||
data_files=data_files,
|
data_files=data_files,
|
||||||
|
|||||||
@@ -730,9 +730,10 @@ def main():
|
|||||||
data_files = {}
|
data_files = {}
|
||||||
if args.train_file is not None:
|
if args.train_file is not None:
|
||||||
data_files["train"] = args.train_file
|
data_files["train"] = args.train_file
|
||||||
|
extension = args.train_file.split(".")[-1]
|
||||||
if args.validation_file is not None:
|
if args.validation_file is not None:
|
||||||
data_files["validation"] = args.validation_file
|
data_files["validation"] = args.validation_file
|
||||||
extension = args.train_file.split(".")[-1]
|
extension = args.validation_file.split(".")[-1]
|
||||||
raw_datasets = load_dataset(extension, data_files=data_files)
|
raw_datasets = load_dataset(extension, data_files=data_files)
|
||||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||||
# https://huggingface.co/docs/datasets/loading_datasets.
|
# https://huggingface.co/docs/datasets/loading_datasets.
|
||||||
|
|||||||
Reference in New Issue
Block a user