Pass datasets trust_remote_code (#31406)
* Pass datasets trust_remote_code
* Pass trust_remote_code in more tests
* Add trust_remote_dataset_code arg to some tests
* Revert "Temporarily pin datasets upper version to fix CI"
  This reverts commit b7672826ca.
* Pass trust_remote_code in librispeech_asr_dummy docstrings
* Revert "Pin datasets<2.20.0 for examples"
  This reverts commit 833fc17a3e.
* Pass trust_remote_code to all examples
* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects
* Pass trust_remote_code to tests
* Pass trust_remote_code to docstrings
* Fix flax examples tests requirements
* Pass trust_remote_dataset_code arg to tests
* Replace trust_remote_dataset_code with trust_remote_code in one example
* Fix duplicate trust_remote_code
* Replace args.trust_remote_dataset_code with args.trust_remote_code
* Replace trust_remote_dataset_code with trust_remote_code in parser
* Replace trust_remote_dataset_code with trust_remote_code in dataclasses
* Replace trust_remote_dataset_code with trust_remote_code arg
This commit is contained in:
committed by
GitHub
parent
485fd81471
commit
a14b055b65
@@ -124,9 +124,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -312,6 +312,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
@@ -321,6 +322,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -329,6 +331,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -195,12 +195,11 @@ def parse_args():
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -327,17 +326,21 @@ def main():
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
raw_datasets = load_dataset(
|
||||
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
split=f"train[:{args.validation_split_percentage}%]",
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
split=f"train[{args.validation_split_percentage}%:]",
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -127,9 +127,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -382,6 +382,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
@@ -391,6 +392,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -399,6 +401,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -257,12 +257,11 @@ def parse_args():
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option"
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -395,17 +394,21 @@ def main():
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
raw_datasets = load_dataset(
|
||||
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
split=f"train[:{args.validation_split_percentage}%]",
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
split=f"train[{args.validation_split_percentage}%:]",
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -121,9 +121,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -324,6 +324,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
@@ -333,6 +334,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -341,6 +343,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
streaming=data_args.streaming,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -202,12 +202,11 @@ def parse_args():
|
||||
parser.add_argument("--hub_token", type=str, help="The token to use to push to the Model Hub.")
|
||||
parser.add_argument(
|
||||
"--trust_remote_code",
|
||||
type=bool,
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
@@ -334,17 +333,21 @@ def main():
|
||||
# download the dataset.
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
raw_datasets = load_dataset(
|
||||
args.dataset_name, args.dataset_config_name, trust_remote_code=args.trust_remote_code
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
split=f"train[:{args.validation_split_percentage}%]",
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
args.dataset_name,
|
||||
args.dataset_config_name,
|
||||
split=f"train[{args.validation_split_percentage}%:]",
|
||||
trust_remote_code=args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -133,6 +133,16 @@ class DataTrainingArguments:
|
||||
dataset_config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||
)
|
||||
trust_remote_code: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
|
||||
validation_file: Optional[str] = field(
|
||||
default=None,
|
||||
@@ -292,6 +302,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
if "validation" not in raw_datasets.keys():
|
||||
raw_datasets["validation"] = load_dataset(
|
||||
@@ -300,6 +311,7 @@ def main():
|
||||
split=f"train[:{data_args.validation_split_percentage}%]",
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
raw_datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -307,6 +319,7 @@ def main():
|
||||
split=f"train[{data_args.validation_split_percentage}%:]",
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
Reference in New Issue
Block a user