Pass datasets trust_remote_code (#31406)
* Pass datasets trust_remote_code * Pass trust_remote_code in more tests * Add trust_remote_dataset_code arg to some tests * Revert "Temporarily pin datasets upper version to fix CI" This reverts commit b7672826ca. * Pass trust_remote_code in librispeech_asr_dummy docstrings * Revert "Pin datasets<2.20.0 for examples" This reverts commit 833fc17a3e. * Pass trust_remote_code to all examples * Revert "Add trust_remote_dataset_code arg to some tests" to research_projects * Pass trust_remote_code to tests * Pass trust_remote_code to docstrings * Fix flax examples tests requirements * Pass trust_remote_dataset_code arg to tests * Replace trust_remote_dataset_code with trust_remote_code in one example * Fix duplicate trust_remote_code * Replace args.trust_remote_dataset_code with args.trust_remote_code * Replace trust_remote_dataset_code with trust_remote_code in parser * Replace trust_remote_dataset_code with trust_remote_code in dataclasses * Replace trust_remote_dataset_code with trust_remote_code arg
This commit is contained in:
committed by
GitHub
parent
485fd81471
commit
a14b055b65
@@ -1,4 +1,4 @@
|
||||
datasets >= 1.13.3,<2.20.0 # Temporary upper version
|
||||
datasets >= 1.13.3
|
||||
pytest<8.0.1
|
||||
conllu
|
||||
nltk
|
||||
|
||||
@@ -195,9 +195,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -458,6 +458,7 @@ def main():
|
||||
keep_in_memory=False,
|
||||
data_dir=data_args.data_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -191,6 +191,16 @@ class DataTrainingArguments:
|
||||
dataset_config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||
)
|
||||
trust_remote_code: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
|
||||
validation_file: Optional[str] = field(
|
||||
default=None,
|
||||
@@ -518,6 +528,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if "validation" not in datasets.keys():
|
||||
@@ -528,6 +539,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -536,6 +548,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -182,9 +182,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -408,6 +408,7 @@ def main():
|
||||
keep_in_memory=False,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if "validation" not in dataset.keys():
|
||||
@@ -418,6 +419,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
dataset["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -426,6 +428,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -188,9 +188,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -446,6 +446,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if "validation" not in datasets.keys():
|
||||
@@ -456,6 +457,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -464,6 +466,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -192,6 +192,16 @@ class DataTrainingArguments:
|
||||
dataset_config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||
)
|
||||
trust_remote_code: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
|
||||
validation_file: Optional[str] = field(
|
||||
default=None,
|
||||
@@ -560,6 +570,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if "validation" not in datasets.keys():
|
||||
@@ -570,6 +581,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
datasets["train"] = load_dataset(
|
||||
data_args.dataset_name,
|
||||
@@ -578,6 +590,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -168,9 +168,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -498,6 +498,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
# Loading the dataset from local csv or json file.
|
||||
|
||||
@@ -136,6 +136,16 @@ class DataTrainingArguments:
|
||||
dataset_config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||
)
|
||||
trust_remote_code: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
text_column: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "The name of the column in the datasets containing the full texts (for summarization)."},
|
||||
@@ -442,6 +452,7 @@ def main():
|
||||
cache_dir=data_args.dataset_cache_dir,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
token=True if model_args.use_auth_token else None,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if training_args.do_eval:
|
||||
@@ -452,6 +463,7 @@ def main():
|
||||
cache_dir=data_args.dataset_cache_dir,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
token=True if model_args.use_auth_token else None,
|
||||
trust_remote_code=data_args.trust_remote_code,
|
||||
)
|
||||
|
||||
if not training_args.do_train and not training_args.do_eval:
|
||||
|
||||
@@ -201,9 +201,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -485,6 +485,7 @@ def main():
|
||||
cache_dir=model_args.cache_dir,
|
||||
keep_in_memory=False,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
|
||||
@@ -265,6 +265,7 @@ class ExamplesTests(TestCasePlus):
|
||||
--dataset_config clean
|
||||
--train_split_name validation
|
||||
--eval_split_name validation
|
||||
--trust_remote_code
|
||||
--output_dir {tmp_dir}
|
||||
--overwrite_output_dir
|
||||
--num_train_epochs=2
|
||||
|
||||
@@ -170,9 +170,9 @@ class ModelArguments:
|
||||
default=False,
|
||||
metadata={
|
||||
"help": (
|
||||
"Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
|
||||
"should only be set to `True` for repositories you trust and in which you have read the code, as it will "
|
||||
"execute code present on the Hub on your local machine."
|
||||
"Whether to trust the execution of code from datasets/models defined on the Hub."
|
||||
" This option should only be set to `True` for repositories you trust and in which you have read the"
|
||||
" code, as it will execute code present on the Hub on your local machine."
|
||||
)
|
||||
},
|
||||
)
|
||||
@@ -449,6 +449,7 @@ def main():
|
||||
data_args.dataset_config_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
token=model_args.token,
|
||||
trust_remote_code=model_args.trust_remote_code,
|
||||
)
|
||||
else:
|
||||
# Loading the dataset from local csv or json file.
|
||||
|
||||
Reference in New Issue
Block a user