Pass datasets trust_remote_code (#31406)
* Pass datasets trust_remote_code
* Pass trust_remote_code in more tests
* Add trust_remote_dataset_code arg to some tests
* Revert "Temporarily pin datasets upper version to fix CI" This reverts commit b7672826ca.
* Pass trust_remote_code in librispeech_asr_dummy docstrings
* Revert "Pin datasets<2.20.0 for examples" This reverts commit 833fc17a3e.
* Pass trust_remote_code to all examples
* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects
* Pass trust_remote_code to tests
* Pass trust_remote_code to docstrings
* Fix flax examples tests requirements
* Pass trust_remote_dataset_code arg to tests
* Replace trust_remote_dataset_code with trust_remote_code in one example
* Fix duplicate trust_remote_code
* Replace args.trust_remote_dataset_code with args.trust_remote_code
* Replace trust_remote_dataset_code with trust_remote_code in parser
* Replace trust_remote_dataset_code with trust_remote_code in dataclasses
* Replace trust_remote_dataset_code with trust_remote_code arg
This commit is contained in:
committed by
GitHub
parent
485fd81471
commit
a14b055b65
@@ -1,4 +1,4 @@
|
||||
datasets >= 1.13.3,<2.20.0 # Temporary upper version
|
||||
datasets >= 1.13.3
|
||||
pytest<8.0.1
|
||||
conllu
|
||||
nltk
|
||||
|
||||
@@ -195,9 +195,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -458,6 +458,7 @@ def main():
|
||||
keep_in_memory=False,
|
||||
data_dir=data_args.data_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -191,6 +191,16 @@ class DataTrainingArguments:
|
||||
dataset_config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||
)
|
||||
trust_remote_code: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
|
||||
validation_file: Optional[str] = field(
|
||||
default=None,
|
||||
@@ -518,6 +528,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if "validation" not in datasets.keys():
|
||||
@@ -528,6 +539,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -536,6 +548,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -182,9 +182,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -408,6 +408,7 @@ def main():
|
||||
keep_in_memory=False,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if "validation" not in dataset.keys():
|
||||
@@ -418,6 +419,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
dataset["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -426,6 +428,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -188,9 +188,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -446,6 +446,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if "validation" not in datasets.keys():
|
||||
@@ -456,6 +457,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -464,6 +466,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -192,6 +192,16 @@ class DataTrainingArguments:
|
||||
dataset_config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||
)
|
||||
trust_remote_code: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
|
||||
validation_file: Optional[str] = field(
|
||||
default=None,
|
||||
@@ -560,6 +570,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if "validation" not in datasets.keys():
|
||||
@@ -570,6 +581,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -578,6 +590,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -168,9 +168,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -498,6 +498,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
# Loading the dataset from local csv or json file.
|
||||
|
||||
@@ -136,6 +136,16 @@ class DataTrainingArguments:
|
||||
dataset_config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||
)
|
||||
trust_remote_code: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
text_column: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
|
||||
@@ -442,6 +452,7 @@ def main():
|
||||
cache_dir=data_args.dataset_cache_dir,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
token=True if model_args.use_auth_token else None,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if training_args.do_eval:
|
||||
@@ -452,6 +463,7 @@ def main():
|
||||
cache_dir=data_args.dataset_cache_dir,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
token=True if model_args.use_auth_token else None,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if not training_args.do_train and not training_args.do_eval:
|
||||
|
||||
@@ -201,9 +201,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -485,6 +485,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
keep_in_memory=False,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -265,6 +265,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--dataset_config clean
|
||||
--train_split_name validation
|
||||
--eval_split_name validation
|
||||
--trust_remote_code
|
||||
--output_dir {tmp_dir}
|
||||
--overwrite_output_dir
|
||||
--num_train_epochs=2
|
||||
|
||||
@@ -170,9 +170,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -449,6 +449,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
# Loading the dataset from local csv or json file.
|
||||
|
||||
@@ -13,7 +13,7 @@ streamlit
|
||||
elasticsearch
|
||||
nltk
|
||||
pandas
|
||||
datasets >= 1.13.3,<2.20.0 # Temporary upper version
|
||||
datasets >= 1.13.3
|
||||
fire
|
||||
pytest<8.0.1
|
||||
conllu
|
||||
|
||||
@@ -165,9 +165,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -261,12 +261,14 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
split=data_args.train_split_name,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["eval"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
data_args.dataset_config_name,
|
||||
split=data_args.eval_split_name,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if data_args.audio_column_name not in raw_datasets["train"].column_names:
|
||||
|
||||
@@ -99,9 +99,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -305,6 +305,7 @@ def main():
|
||||
keep_in_memory=False,
|
||||
data_dir=data_args.data_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -164,9 +164,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -242,6 +242,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -150,12 +150,11 @@ def parse_args():
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -284,7 +283,7 @@ def main():
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
dataset = load_dataset(args.dataset_name)
|
||||
dataset = load_dataset(args.dataset_name, trust_remote_code=args.trust_remote_code)
|
||||
else:
|
||||
data_files = {}
|
||||
if args.train_dir is not None:
|
||||
|
||||
@@ -63,6 +63,16 @@ class DataTrainingArguments:
|
||||
dataset_config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||
)
|
||||
trust_remote_code: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
image_column_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "The column name of the images in the files."}
|
||||
)
|
||||
@@ -225,6 +235,7 @@ def main():
|
||||
data_files=data_args.data_files,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
|
||||
# If we don't have a validation split, split off a percentage of train as validation.
|
||||
|
||||
@@ -166,9 +166,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -299,6 +299,7 @@ def main():
|
||||
data_files=data_args.data_files,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
|
||||
# If we don't have a validation split, split off a percentage of train as validation.
|
||||
|
||||
@@ -197,12 +197,11 @@ def parse_args():
|
||||
)
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -441,6 +440,7 @@ def main():
|
||||
data_files=args.data_files,
|
||||
cache_dir=args.cache_dir,
|
||||
token=args.token,
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
|
||||
# If we don't have a validation split, split off a percentage of train as validation.
|
||||
|
||||
@@ -68,6 +68,16 @@ class Arguments:
|
||||
"help": "Name of a dataset from the hub (could be your own, possibly private dataset hosted on the hub)."
|
||||
},
|
||||
)
|
||||
trust_remote_code: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
image_height: Optional[int] = field(default=512, metadata={"help": "Image height after resizing."})
|
||||
image_width: Optional[int] = field(default=512, metadata={"help": "Image width after resizing."})
|
||||
token: str = field(
|
||||
@@ -364,7 +374,7 @@ def main():
|
||||
# Load dataset, prepare splits
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
|
||||
dataset = load_dataset(args.dataset_name)
|
||||
dataset = load_dataset(args.dataset_name, trust_remote_code=args.trust_remote_code)
|
||||
|
||||
# We need to specify the label2id mapping for the model
|
||||
# it is a mapping from semantic class name to class index.
|
||||
|
||||
@@ -71,6 +71,15 @@ def parse_args():
|
||||
help="Name of the dataset on the hub.",
|
||||
default="qubvel-hf/ade20k-mini",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_height",
|
||||
type=int,
|
||||
@@ -425,7 +434,7 @@ def main():
|
||||
|
||||
# In distributed training, the load_dataset function guarantees that only one local process can concurrently
|
||||
# download the dataset.
|
||||
dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir)
|
||||
dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code)
|
||||
|
||||
# We need to specify the label2id mapping for the model
|
||||
# it is a mapping from semantic class name to class index.
|
||||
|
||||
@@ -124,9 +124,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -312,6 +312,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
@@ -321,6 +322,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -329,6 +331,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -195,12 +195,11 @@ def parse_args():
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -327,17 +326,21 @@ def main():
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
raw_datasets = load_dataset(
|
||||
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
split=f"train[:{args.validation_split_percentage}%]",
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
split=f"train[{args.validation_split_percentage}%:]",
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -127,9 +127,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -382,6 +382,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
@@ -391,6 +392,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -399,6 +401,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -257,12 +257,11 @@ def parse_args():
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -395,17 +394,21 @@ def main():
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
raw_datasets = load_dataset(
|
||||
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
split=f"train[:{args.validation_split_percentage}%]",
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
split=f"train[{args.validation_split_percentage}%:]",
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -121,9 +121,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -324,6 +324,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
@@ -333,6 +334,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -341,6 +343,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -202,12 +202,11 @@ def parse_args():
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -334,17 +333,21 @@ def main():
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
raw_datasets = load_dataset(
|
||||
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
split=f"train[:{args.validation_split_percentage}%]",
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
split=f"train[{args.validation_split_percentage}%:]",
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -133,6 +133,16 @@ class DataTrainingArguments:
|
||||
dataset_config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||
)
|
||||
trust_remote_code: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
|
||||
validation_file: Optional[str] = field(
|
||||
default=None,
|
||||
@@ -292,6 +302,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
@@ -300,6 +311,7 @@ def main():
|
||||
split=f"train[:{data_args.validation_split_percentage}%]",
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -307,6 +319,7 @@ def main():
|
||||
split=f"train[{data_args.validation_split_percentage}%:]",
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -184,12 +184,11 @@ def parse_args():
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -351,7 +350,9 @@ def main():
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
raw_datasets = load_dataset(
|
||||
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
if args.train_file is not None:
|
||||
|
||||
@@ -313,9 +313,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -383,7 +383,9 @@ def main():
|
||||
# Load dataset, prepare splits
|
||||
# ------------------------------------------------------------------------------------------------
|
||||
|
||||
dataset = load_dataset(data_args.dataset_name, cache_dir=model_args.cache_dir)
|
||||
dataset = load_dataset(
|
||||
data_args.dataset_name, cache_dir=model_args.cache_dir, trust_remote_code=model_args.trust_remote_code
|
||||
)
|
||||
|
||||
# If we don't have a validation split, split off a percentage of train as validation
|
||||
data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split
|
||||
|
||||
@@ -340,12 +340,11 @@ def parse_args():
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -445,7 +444,7 @@ def main():
|
||||
# Load dataset
|
||||
# In distributed training, the load_dataset function guarantees that only one local process can concurrently
|
||||
# download the dataset.
|
||||
dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir)
|
||||
dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code)
|
||||
|
||||
# If we don't have a validation split, split off a percentage of train as validation.
|
||||
args.train_val_split = None if "validation" in dataset.keys() else args.train_val_split
|
||||
|
||||
@@ -93,9 +93,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -301,6 +301,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -101,6 +101,16 @@ class DataTrainingArguments:
|
||||
dataset_config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||
)
|
||||
trust_remote_code: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
|
||||
validation_file: Optional[str] = field(
|
||||
default=None,
|
||||
@@ -289,6 +299,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -100,6 +100,15 @@ def parse_args():
|
||||
default=None,
|
||||
help="The configuration name of the dataset to use (via the datasets library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_file", type=str, default=None, help="A csv or a json file containing the training data."
|
||||
)
|
||||
@@ -356,7 +365,9 @@ def main():
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
raw_datasets = load_dataset(
|
||||
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
if args.train_file is not None:
|
||||
|
||||
@@ -275,12 +275,11 @@ def parse_args():
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -404,7 +403,9 @@ def main():
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
raw_datasets = load_dataset(
|
||||
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
if args.train_file is not None:
|
||||
|
||||
@@ -93,9 +93,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -346,6 +346,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -165,9 +165,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -233,7 +233,9 @@ def main():
|
||||
# In distributed training, the load_dataset function guarantees that only one local process can concurrently
|
||||
# download the dataset.
|
||||
# TODO support datasets from local folders
|
||||
dataset = load_dataset(data_args.dataset_name, cache_dir=model_args.cache_dir)
|
||||
dataset = load_dataset(
|
||||
data_args.dataset_name, cache_dir=model_args.cache_dir, trust_remote_code=model_args.trust_remote_code
|
||||
)
|
||||
|
||||
# Rename column names to standardized names (only "image" and "label" need to be present)
|
||||
if "pixel_values" in dataset["train"].column_names:
|
||||
|
||||
@@ -180,12 +180,11 @@ def parse_args():
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -294,7 +293,7 @@ def main():
|
||||
# In distributed training, the load_dataset function guarantees that only one local process can concurrently
|
||||
# download the dataset.
|
||||
# TODO support datasets from local folders
|
||||
dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir)
|
||||
dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code)
|
||||
|
||||
# Rename column names to standardized names (only "image" and "label" need to be present)
|
||||
if "pixel_values" in dataset["train"].column_names:
|
||||
|
||||
@@ -71,6 +71,15 @@ def parse_args():
|
||||
required=True,
|
||||
help="The names of the training data set splits to use (via the datasets library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--preprocessing_num_workers",
|
||||
type=int,
|
||||
@@ -446,6 +455,7 @@ def main():
|
||||
dataset_config_name,
|
||||
split=train_split_name,
|
||||
cache_dir=args.cache_dir,
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
datasets_splits.append(dataset_split)
|
||||
|
||||
|
||||
@@ -255,9 +255,9 @@ class DataTrainingArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -454,6 +454,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
split=data_args.train_split_name,
|
||||
token=data_args.token,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if data_args.audio_column_name not in raw_datasets["train"].column_names:
|
||||
@@ -479,6 +480,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
split=data_args.eval_split_name,
|
||||
token=data_args.token,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if data_args.max_eval_samples is not None:
|
||||
|
||||
@@ -245,9 +245,9 @@ class DataTrainingArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -434,6 +434,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
split=data_args.train_split_name,
|
||||
token=data_args.token,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if data_args.audio_column_name not in raw_datasets["train"].column_names:
|
||||
@@ -459,6 +460,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
split=data_args.eval_split_name,
|
||||
token=data_args.token,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if data_args.max_eval_samples is not None:
|
||||
|
||||
@@ -98,9 +98,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -347,6 +347,7 @@ def main():
|
||||
split=data_args.train_split_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if training_args.do_eval:
|
||||
@@ -356,6 +357,7 @@ def main():
|
||||
split=data_args.eval_split_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names:
|
||||
|
||||
@@ -112,9 +112,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -397,6 +397,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -268,12 +268,11 @@ def parse_args():
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -398,7 +397,9 @@ def main():
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
raw_datasets = load_dataset(
|
||||
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
if args.train_file is not None:
|
||||
|
||||
@@ -313,6 +313,7 @@ class ExamplesTestsNoTrainer(TestCasePlus):
|
||||
{self.examples_dir}/pytorch/image-classification/run_image_classification_no_trainer.py
|
||||
--model_name_or_path google/vit-base-patch16-224-in21k
|
||||
--dataset_name hf-internal-testing/cats_vs_dogs_sample
|
||||
--trust_remote_code
|
||||
--learning_rate 1e-4
|
||||
--per_device_train_batch_size 2
|
||||
--per_device_eval_batch_size 1
|
||||
|
||||
@@ -391,6 +391,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--output_dir {tmp_dir}
|
||||
--model_name_or_path google/vit-base-patch16-224-in21k
|
||||
--dataset_name hf-internal-testing/cats_vs_dogs_sample
|
||||
--trust_remote_code
|
||||
--do_train
|
||||
--do_eval
|
||||
--learning_rate 1e-4
|
||||
@@ -424,6 +425,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--dataset_config_name clean
|
||||
--train_split_name validation
|
||||
--eval_split_name validation
|
||||
--trust_remote_code
|
||||
--do_train
|
||||
--do_eval
|
||||
--learning_rate 1e-4
|
||||
@@ -454,6 +456,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--dataset_config_name clean
|
||||
--train_split_name validation
|
||||
--eval_split_name validation
|
||||
--trust_remote_code
|
||||
--do_train
|
||||
--do_eval
|
||||
--learning_rate 1e-4
|
||||
@@ -486,6 +489,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--dataset_config_name clean
|
||||
--train_split_name validation
|
||||
--eval_split_name validation
|
||||
--trust_remote_code
|
||||
--do_train
|
||||
--do_eval
|
||||
--learning_rate 1e-4
|
||||
@@ -513,6 +517,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--output_dir {tmp_dir}
|
||||
--model_name_or_path hf-internal-testing/tiny-random-wav2vec2
|
||||
--dataset_name anton-l/superb_demo
|
||||
--trust_remote_code
|
||||
--dataset_config_name ks
|
||||
--train_split_name test
|
||||
--eval_split_name test
|
||||
@@ -547,6 +552,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--dataset_name hf-internal-testing/librispeech_asr_dummy
|
||||
--dataset_config_names clean
|
||||
--dataset_split_names validation
|
||||
--trust_remote_code
|
||||
--learning_rate 1e-4
|
||||
--per_device_train_batch_size 4
|
||||
--per_device_eval_batch_size 4
|
||||
@@ -567,6 +573,7 @@ class ExamplesTests(TestCasePlus):
|
||||
run_mae.py
|
||||
--output_dir {tmp_dir}
|
||||
--dataset_name hf-internal-testing/cats_vs_dogs_sample
|
||||
--trust_remote_code
|
||||
--do_train
|
||||
--do_eval
|
||||
--learning_rate 1e-4
|
||||
|
||||
@@ -240,9 +240,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -338,6 +338,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
# Try print some info about the dataset
|
||||
logger.info(f"Dataset loaded: {raw_datasets}")
|
||||
|
||||
@@ -201,9 +201,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -300,6 +300,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
# Loading a dataset from your local files.
|
||||
|
||||
@@ -92,9 +92,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -290,6 +290,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -212,12 +212,11 @@ def parse_args():
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -333,7 +332,9 @@ def main():
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
raw_datasets = load_dataset(
|
||||
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
if args.train_file is not None:
|
||||
|
||||
@@ -102,9 +102,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -346,6 +346,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -76,7 +76,6 @@ def parse_args():
|
||||
default=None,
|
||||
help="The name of the dataset to use (via the datasets library).",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--predict_with_generate",
|
||||
type=bool,
|
||||
@@ -259,12 +258,11 @@ def parse_args():
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -378,7 +376,9 @@ def main():
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
raw_datasets = load_dataset(
|
||||
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
if args.train_file is not None:
|
||||
|
||||
@@ -14,7 +14,7 @@ streamlit
|
||||
elasticsearch
|
||||
nltk
|
||||
pandas
|
||||
datasets >= 1.13.3,<2.20.0 # Temporary upper version
|
||||
datasets >= 1.13.3
|
||||
fire
|
||||
pytest<8.0.1
|
||||
conllu
|
||||
|
||||
@@ -105,9 +105,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -326,6 +326,7 @@ def main():
|
||||
keep_in_memory=False,
|
||||
data_dir=data_args.data_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -171,9 +171,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -284,6 +284,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
task="image-classification",
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -42,6 +42,15 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--dataset_config", type=str, default="wikitext-103-raw-v1", help="Configuration name of the dataset."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name_or_path",
|
||||
type=str,
|
||||
@@ -105,7 +114,9 @@ def get_serialized_examples(tokenized_data):
|
||||
|
||||
|
||||
def main(args):
|
||||
dataset = datasets.load_dataset(args.dataset_name, args.dataset_config, split=args.split)
|
||||
dataset = datasets.load_dataset(
|
||||
args.dataset_name, args.dataset_config, split=args.split, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
|
||||
if args.limit is not None:
|
||||
max_samples = min(len(dataset), args.limit)
|
||||
|
||||
@@ -41,6 +41,15 @@ def parse_args():
|
||||
parser.add_argument(
|
||||
"--dataset_config", type=str, default="wikitext-103-raw-v1", help="Configuration name of the dataset."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--batch_size",
|
||||
type=int,
|
||||
@@ -69,7 +78,9 @@ def parse_args():
|
||||
|
||||
|
||||
def main(args):
|
||||
dataset = datasets.load_dataset(args.dataset_name, args.dataset_config, split="train")
|
||||
dataset = datasets.load_dataset(
|
||||
args.dataset_name, args.dataset_config, split="train", trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
|
||||
if args.limit is not None:
|
||||
max_train_samples = min(len(dataset), args.limit)
|
||||
|
||||
@@ -125,9 +125,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -298,6 +298,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
@@ -306,6 +307,7 @@ def main():
|
||||
split=f"train[:{data_args.validation_split_percentage}%]",
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -313,6 +315,7 @@ def main():
|
||||
split=f"train[{data_args.validation_split_percentage}%:]",
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -123,9 +123,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -307,6 +307,7 @@ def main():
|
||||
data_args.dataset_name,
|
||||
data_args.dataset_config_name,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
@@ -314,12 +315,14 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
split=f"train[:{data_args.validation_split_percentage}%]",
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
data_args.dataset_config_name,
|
||||
split=f"train[{data_args.validation_split_percentage}%:]",
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -104,9 +104,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -329,6 +329,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -112,9 +112,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -366,6 +366,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -316,6 +316,7 @@ class ExamplesTests(TestCasePlus):
|
||||
testargs = f"""
|
||||
run_image_classification.py
|
||||
--dataset_name hf-internal-testing/cats_vs_dogs_sample
|
||||
--trust_remote_code
|
||||
--model_name_or_path microsoft/resnet-18
|
||||
--do_train
|
||||
--do_eval
|
||||
|
||||
@@ -88,9 +88,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -239,6 +239,7 @@ def main():
|
||||
data_args.dataset_name,
|
||||
data_args.dataset_config_name,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -106,9 +106,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -333,6 +333,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
Reference in New Issue
Block a user