Pass datasets trust_remote_code (#31406)

* Pass datasets trust_remote_code * Pass trust_remote_code in more tests * Add trust_remote_dataset_code arg to some tests * Revert "Temporarily pin datasets upper version to fix CI" This reverts commit b7672826ca. * Pass trust_remote_code in librispeech_asr_dummy docstrings * Revert "Pin datasets<2.20.0 for examples" This reverts commit 833fc17a3e. * Pass trust_remote_code to all examples * Revert "Add trust_remote_dataset_code arg to some tests" to research_projects * Pass trust_remote_code to tests * Pass trust_remote_code to docstrings * Fix flax examples tests requirements * Pass trust_remote_dataset_code arg to tests * Replace trust_remote_dataset_code with trust_remote_code in one example * Fix duplicate trust_remote_code * Replace args.trust_remote_dataset_code with args.trust_remote_code * Replace trust_remote_dataset_code with trust_remote_code in parser * Replace trust_remote_dataset_code with trust_remote_code in dataclasses * Replace trust_remote_dataset_code with trust_remote_code arg
2024-06-17 18:29:13 +02:00
parent 485fd81471
commit a14b055b65
168 changed files with 804 additions and 410 deletions
--- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py
+++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py
@@ -68,6 +68,16 @@ class Arguments:
            "help": "Name of a dataset from the hub (could be your own, possibly private dataset hosted on the hub)."
        },
    )
+    trust_remote_code: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to trust the execution of code from datasets/models defined on the Hub."
+                " This option should only be set to `True` for repositories you trust and in which you have read the"
+                " code, as it will execute code present on the Hub on your local machine."
+            )
+        },
+    )
    image_height: Optional[int] = field(default=512, metadata={"help": "Image height after resizing."})
    image_width: Optional[int] = field(default=512, metadata={"help": "Image width after resizing."})
    token: str = field(
@@ -364,7 +374,7 @@ def main():
    # Load dataset, prepare splits
    # ------------------------------------------------------------------------------------------------

-    dataset = load_dataset(args.dataset_name)
+    dataset = load_dataset(args.dataset_name, trust_remote_code=args.trust_remote_code)

    # We need to specify the label2id mapping for the model
    # it is a mapping from semantic class name to class index.
--- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py
+++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py
@@ -71,6 +71,15 @@ def parse_args():
        help="Name of the dataset on the hub.",
        default="qubvel-hf/ade20k-mini",
    )
+    parser.add_argument(
+        "--trust_remote_code",
+        action="store_true",
+        help=(
+            "Whether to trust the execution of code from datasets/models defined on the Hub."
+            " This option should only be set to `True` for repositories you trust and in which you have read the"
+            " code, as it will execute code present on the Hub on your local machine."
+        ),
+    )
    parser.add_argument(
        "--image_height",
        type=int,
@@ -425,7 +434,7 @@ def main():

    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
-    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir)
+    dataset = load_dataset(args.dataset_name, cache_dir=args.cache_dir, trust_remote_code=args.trust_remote_code)

    # We need to specify the label2id mapping for the model
    # it is a mapping from semantic class name to class index.