Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code
* Pass trust_remote_code in more tests
* Add trust_remote_dataset_code arg to some tests
* Revert "Temporarily pin datasets upper version to fix CI"
  This reverts commit b7672826ca.
* Pass trust_remote_code in librispeech_asr_dummy docstrings
* Revert "Pin datasets<2.20.0 for examples"
  This reverts commit 833fc17a3e.
* Pass trust_remote_code to all examples
* Revert "Add trust_remote_dataset_code arg to some tests" to research_projects
* Pass trust_remote_code to tests
* Pass trust_remote_code to docstrings
* Fix flax examples tests requirements
* Pass trust_remote_dataset_code arg to tests
* Replace trust_remote_dataset_code with trust_remote_code in one example
* Fix duplicate trust_remote_code
* Replace args.trust_remote_dataset_code with args.trust_remote_code
* Replace trust_remote_dataset_code with trust_remote_code in parser
* Replace trust_remote_dataset_code with trust_remote_code in dataclasses
* Replace trust_remote_dataset_code with trust_remote_code arg
2024-06-17 18:29:13 +02:00
parent 485fd81471
commit a14b055b65
168 changed files with 804 additions and 410 deletions
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@@ -255,9 +255,9 @@ class DataTrainingArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -454,6 +454,7 @@ def main():
            data_args.dataset_config_name,
            split=data_args.train_split_name,
            token=data_args.token,
+            trust_remote_code=data_args.trust_remote_code,
        )

        if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -479,6 +480,7 @@ def main():
            data_args.dataset_config_name,
            split=data_args.eval_split_name,
            token=data_args.token,
+            trust_remote_code=data_args.trust_remote_code,
        )

        if data_args.max_eval_samples is not None:
--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
@@ -245,9 +245,9 @@ class DataTrainingArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -434,6 +434,7 @@ def main():
            data_args.dataset_config_name,
            split=data_args.train_split_name,
            token=data_args.token,
+            trust_remote_code=data_args.trust_remote_code,
        )

        if data_args.audio_column_name not in raw_datasets["train"].column_names:
@@ -459,6 +460,7 @@ def main():
            data_args.dataset_config_name,
            split=data_args.eval_split_name,
            token=data_args.token,
+            trust_remote_code=data_args.trust_remote_code,
        )

        if data_args.max_eval_samples is not None:
--- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
@@ -98,9 +98,9 @@ class ModelArguments:
        default=False,
        metadata={
            "help": (
-                "Whether or not to allow for custom models defined on the Hub in their own modeling files. This option "
-                "should only be set to `True` for repositories you trust and in which you have read the code, as it will "
-                "execute code present on the Hub on your local machine."
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
            )
        },
    )
@@ -347,6 +347,7 @@ def main():
            split=data_args.train_split_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )

    if training_args.do_eval:
@@ -356,6 +357,7 @@ def main():
            split=data_args.eval_split_name,
            cache_dir=model_args.cache_dir,
            token=model_args.token,
+            trust_remote_code=model_args.trust_remote_code,
        )

    if data_args.audio_column_name not in next(iter(raw_datasets.values())).column_names: